{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999398785546805, "eval_steps": 500, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.04857712, "auxiliary_loss_mlp": 0.0199187, "balance_loss_clip": 2.39669204, "balance_loss_mlp": 1.67505896, "epoch": 0.00012024289063909097, "flos": 24932483919360.0, "grad_norm": 40.11934686879225, "language_loss": 2.58061337, "learning_rate": 0.0, "loss": 1.90124202, "num_input_tokens_seen": 20375, "step": 1, "time_per_iteration": 12.615964412689209 }, { "auxiliary_loss_clip": 0.03217209, "auxiliary_loss_mlp": 0.01379494, "balance_loss_clip": 1.59208441, "balance_loss_mlp": 1.16548967, "epoch": 0.00024048578127818193, "flos": 30664624377600.0, "grad_norm": 54.660974841893434, "language_loss": 1.89303315, "learning_rate": 5.021476677069823e-07, "loss": 1.93900037, "num_input_tokens_seen": 39035, "step": 2, "time_per_iteration": 2.5258588790893555 }, { "auxiliary_loss_clip": 0.03213671, "auxiliary_loss_mlp": 0.01349028, "balance_loss_clip": 1.60020781, "balance_loss_mlp": 1.14379764, "epoch": 0.0003607286719172729, "flos": 19026227969280.0, "grad_norm": 40.482147465856336, "language_loss": 1.61610591, "learning_rate": 7.958852231401551e-07, "loss": 1.66173291, "num_input_tokens_seen": 57600, "step": 3, "time_per_iteration": 2.3406474590301514 }, { "auxiliary_loss_clip": 0.03232699, "auxiliary_loss_mlp": 0.01318564, "balance_loss_clip": 1.59900165, "balance_loss_mlp": 1.10885084, "epoch": 0.00048097156255636386, "flos": 19316314206720.0, "grad_norm": 36.579217578925636, "language_loss": 1.64638352, "learning_rate": 1.0042953354139647e-06, "loss": 1.6918962, "num_input_tokens_seen": 76465, "step": 4, "time_per_iteration": 2.456038475036621 }, { "auxiliary_loss_clip": 0.03190038, "auxiliary_loss_mlp": 0.01345037, "balance_loss_clip": 1.58716583, "balance_loss_mlp": 1.13751745, "epoch": 0.0006012144531954548, "flos": 13991264893440.0, "grad_norm": 54.9508034791524, "language_loss": 1.93431103, "learning_rate": 1.1659507774310057e-06, "loss": 1.97966182, "num_input_tokens_seen": 94350, "step": 5, "time_per_iteration": 2.6019227504730225 }, { "auxiliary_loss_clip": 0.03238536, "auxiliary_loss_mlp": 0.01345963, "balance_loss_clip": 1.60101831, "balance_loss_mlp": 1.14244902, "epoch": 0.0007214573438345458, "flos": 23148988225920.0, "grad_norm": 45.126698464283194, "language_loss": 1.60975599, "learning_rate": 1.2980328908471373e-06, "loss": 1.65560091, "num_input_tokens_seen": 114595, "step": 6, "time_per_iteration": 2.7047126293182373 }, { "auxiliary_loss_clip": 0.03341887, "auxiliary_loss_mlp": 0.01596753, "balance_loss_clip": 1.70233011, "balance_loss_mlp": 1.43958735, "epoch": 0.0008417002344736367, "flos": 67663246170240.0, "grad_norm": 4.610361848558148, "language_loss": 0.81500804, "learning_rate": 1.4097067265369432e-06, "loss": 0.86439443, "num_input_tokens_seen": 179590, "step": 7, "time_per_iteration": 3.134592056274414 }, { "auxiliary_loss_clip": 0.03219778, "auxiliary_loss_mlp": 0.01327512, "balance_loss_clip": 1.59819603, "balance_loss_mlp": 1.11570144, "epoch": 0.0009619431251127277, "flos": 21281381504640.0, "grad_norm": 41.439896748159214, "language_loss": 1.58792031, "learning_rate": 1.506443003120947e-06, "loss": 1.63339305, "num_input_tokens_seen": 195090, "step": 8, "time_per_iteration": 2.6757330894470215 }, { "auxiliary_loss_clip": 0.03175941, "auxiliary_loss_mlp": 0.01310091, "balance_loss_clip": 1.58784723, "balance_loss_mlp": 1.10686302, "epoch": 0.0010821860157518186, "flos": 23331342597120.0, "grad_norm": 18.15604455892974, "language_loss": 1.47693348, "learning_rate": 1.5917704462803102e-06, "loss": 1.52179372, "num_input_tokens_seen": 211635, "step": 9, "time_per_iteration": 2.6493964195251465 }, { "auxiliary_loss_clip": 0.03206275, "auxiliary_loss_mlp": 0.01350741, "balance_loss_clip": 1.59173989, "balance_loss_mlp": 1.14407992, "epoch": 0.0012024289063909096, "flos": 17010166337280.0, "grad_norm": 13.062396004046407, "language_loss": 1.52989841, "learning_rate": 1.6680984451379884e-06, "loss": 1.57546866, "num_input_tokens_seen": 224705, "step": 10, "time_per_iteration": 2.645115613937378 }, { "auxiliary_loss_clip": 0.03193913, "auxiliary_loss_mlp": 0.01362597, "balance_loss_clip": 1.59084797, "balance_loss_mlp": 1.14592206, "epoch": 0.0013226717970300007, "flos": 21288133261440.0, "grad_norm": 13.95555698780662, "language_loss": 1.32547235, "learning_rate": 1.7371455188905097e-06, "loss": 1.37103748, "num_input_tokens_seen": 244635, "step": 11, "time_per_iteration": 2.798292398452759 }, { "auxiliary_loss_clip": 0.03209931, "auxiliary_loss_mlp": 0.01358659, "balance_loss_clip": 1.59634089, "balance_loss_mlp": 1.14618027, "epoch": 0.0014429146876690916, "flos": 27237884935680.0, "grad_norm": 10.668008037061085, "language_loss": 1.25325942, "learning_rate": 1.8001805585541196e-06, "loss": 1.29894519, "num_input_tokens_seen": 265765, "step": 12, "time_per_iteration": 2.798614501953125 }, { "auxiliary_loss_clip": 0.03196177, "auxiliary_loss_mlp": 0.01363921, "balance_loss_clip": 1.59832883, "balance_loss_mlp": 1.15449381, "epoch": 0.0015631575783081825, "flos": 19062174504960.0, "grad_norm": 6.740215697457617, "language_loss": 1.29070807, "learning_rate": 1.8581671739548328e-06, "loss": 1.33630896, "num_input_tokens_seen": 283500, "step": 13, "time_per_iteration": 2.7212860584259033 }, { "auxiliary_loss_clip": 0.03197996, "auxiliary_loss_mlp": 0.01335406, "balance_loss_clip": 1.59409261, "balance_loss_mlp": 1.1274097, "epoch": 0.0016834004689472734, "flos": 48139473985920.0, "grad_norm": 6.121092676105596, "language_loss": 1.13598537, "learning_rate": 1.9118543942439254e-06, "loss": 1.18131948, "num_input_tokens_seen": 305685, "step": 14, "time_per_iteration": 4.335437774658203 }, { "auxiliary_loss_clip": 0.03127148, "auxiliary_loss_mlp": 0.0134348, "balance_loss_clip": 1.58910632, "balance_loss_mlp": 1.13309956, "epoch": 0.0018036433595863645, "flos": 34970026314240.0, "grad_norm": 5.904381974006851, "language_loss": 1.12619162, "learning_rate": 1.961836000571161e-06, "loss": 1.17089796, "num_input_tokens_seen": 327340, "step": 15, "time_per_iteration": 3.5974414348602295 }, { "auxiliary_loss_clip": 0.03198165, "auxiliary_loss_mlp": 0.0146863, "balance_loss_clip": 1.68115842, "balance_loss_mlp": 1.32062018, "epoch": 0.0019238862502254555, "flos": 59768284440960.0, "grad_norm": 3.8014902663919754, "language_loss": 0.64689642, "learning_rate": 2.0085906708279293e-06, "loss": 0.69356436, "num_input_tokens_seen": 382710, "step": 16, "time_per_iteration": 3.1316792964935303 }, { "auxiliary_loss_clip": 0.03131644, "auxiliary_loss_mlp": 0.01352452, "balance_loss_clip": 1.58373582, "balance_loss_mlp": 1.14788878, "epoch": 0.0020441291408645466, "flos": 20814543417600.0, "grad_norm": 4.8916894515364735, "language_loss": 1.15955639, "learning_rate": 2.0525099325728135e-06, "loss": 1.20439732, "num_input_tokens_seen": 400890, "step": 17, "time_per_iteration": 2.6377735137939453 }, { "auxiliary_loss_clip": 0.03155775, "auxiliary_loss_mlp": 0.01429185, "balance_loss_clip": 1.67447209, "balance_loss_mlp": 1.28422666, "epoch": 0.0021643720315036373, "flos": 63857001582720.0, "grad_norm": 3.575649053011835, "language_loss": 0.72171211, "learning_rate": 2.0939181139872922e-06, "loss": 0.76756167, "num_input_tokens_seen": 462605, "step": 18, "time_per_iteration": 3.095980167388916 }, { "auxiliary_loss_clip": 0.03136447, "auxiliary_loss_mlp": 0.01294968, "balance_loss_clip": 1.58663404, "balance_loss_mlp": 1.0953635, "epoch": 0.0022846149221427284, "flos": 31284981192960.0, "grad_norm": 4.756459959382579, "language_loss": 1.01624095, "learning_rate": 2.1330868934640175e-06, "loss": 1.0605551, "num_input_tokens_seen": 483280, "step": 19, "time_per_iteration": 2.7125473022460938 }, { "auxiliary_loss_clip": 0.03092702, "auxiliary_loss_mlp": 0.01394372, "balance_loss_clip": 1.66258645, "balance_loss_mlp": 1.25399137, "epoch": 0.002404857812781819, "flos": 51083648161920.0, "grad_norm": 3.5580436290505593, "language_loss": 0.76393998, "learning_rate": 2.170246112844971e-06, "loss": 0.80881071, "num_input_tokens_seen": 537620, "step": 20, "time_per_iteration": 2.8881995677948 }, { "auxiliary_loss_clip": 0.03062899, "auxiliary_loss_mlp": 0.0132019, "balance_loss_clip": 1.56706142, "balance_loss_mlp": 1.11658025, "epoch": 0.0025251007034209102, "flos": 15815347309440.0, "grad_norm": 4.37472530942615, "language_loss": 1.01542091, "learning_rate": 2.2055919496770983e-06, "loss": 1.05925179, "num_input_tokens_seen": 555760, "step": 21, "time_per_iteration": 2.6456518173217773 }, { "auxiliary_loss_clip": 0.03024953, "auxiliary_loss_mlp": 0.01337104, "balance_loss_clip": 1.56464064, "balance_loss_mlp": 1.13454401, "epoch": 0.0026453435940600014, "flos": 37851857458560.0, "grad_norm": 4.159550303111269, "language_loss": 0.89440864, "learning_rate": 2.2392931865974923e-06, "loss": 0.93802917, "num_input_tokens_seen": 578450, "step": 22, "time_per_iteration": 2.7724180221557617 }, { "auxiliary_loss_clip": 0.03026118, "auxiliary_loss_mlp": 0.01303982, "balance_loss_clip": 1.56200361, "balance_loss_mlp": 1.10208929, "epoch": 0.002765586484699092, "flos": 21141976821120.0, "grad_norm": 4.356844440513057, "language_loss": 1.01692307, "learning_rate": 2.271496085962064e-06, "loss": 1.06022406, "num_input_tokens_seen": 596145, "step": 23, "time_per_iteration": 2.6588985919952393 }, { "auxiliary_loss_clip": 0.02960346, "auxiliary_loss_mlp": 0.01295657, "balance_loss_clip": 1.55110884, "balance_loss_mlp": 1.09309649, "epoch": 0.002885829375338183, "flos": 20667381396480.0, "grad_norm": 3.7172172735020097, "language_loss": 1.02769744, "learning_rate": 2.3023282262611022e-06, "loss": 1.07025743, "num_input_tokens_seen": 614920, "step": 24, "time_per_iteration": 2.625957727432251 }, { "auxiliary_loss_clip": 0.02953816, "auxiliary_loss_mlp": 0.01285576, "balance_loss_clip": 1.55001831, "balance_loss_mlp": 1.09465003, "epoch": 0.003006072265977274, "flos": 34823869873920.0, "grad_norm": 3.718087701156514, "language_loss": 0.925192, "learning_rate": 2.3319015548620114e-06, "loss": 0.96758592, "num_input_tokens_seen": 636060, "step": 25, "time_per_iteration": 2.782104969024658 }, { "auxiliary_loss_clip": 0.02953851, "auxiliary_loss_mlp": 0.0130588, "balance_loss_clip": 1.54961729, "balance_loss_mlp": 1.10284245, "epoch": 0.003126315156616365, "flos": 24422021118720.0, "grad_norm": 2.1123687812523606, "language_loss": 0.9296093, "learning_rate": 2.3603148416618152e-06, "loss": 0.97220659, "num_input_tokens_seen": 655575, "step": 26, "time_per_iteration": 2.744178056716919 }, { "auxiliary_loss_clip": 0.02893513, "auxiliary_loss_mlp": 0.01279401, "balance_loss_clip": 1.54653335, "balance_loss_mlp": 1.08284855, "epoch": 0.003246558047255456, "flos": 23622326674560.0, "grad_norm": 2.3453359149862854, "language_loss": 1.00922859, "learning_rate": 2.3876556694204647e-06, "loss": 1.05095768, "num_input_tokens_seen": 675730, "step": 27, "time_per_iteration": 2.784576177597046 }, { "auxiliary_loss_clip": 0.02851188, "auxiliary_loss_mlp": 0.01291519, "balance_loss_clip": 1.53105366, "balance_loss_mlp": 1.09916306, "epoch": 0.003366800937894547, "flos": 17820275725440.0, "grad_norm": 2.8130831364628386, "language_loss": 0.90652895, "learning_rate": 2.414002061950908e-06, "loss": 0.94795597, "num_input_tokens_seen": 694605, "step": 28, "time_per_iteration": 2.7183852195739746 }, { "auxiliary_loss_clip": 0.02842421, "auxiliary_loss_mlp": 0.01286896, "balance_loss_clip": 1.53033519, "balance_loss_mlp": 1.0966382, "epoch": 0.003487043828533638, "flos": 24426115269120.0, "grad_norm": 2.321106806316836, "language_loss": 0.99701285, "learning_rate": 2.4394238264681557e-06, "loss": 1.03830612, "num_input_tokens_seen": 714340, "step": 29, "time_per_iteration": 2.7113656997680664 }, { "auxiliary_loss_clip": 0.02849551, "auxiliary_loss_mlp": 0.01251704, "balance_loss_clip": 1.53163314, "balance_loss_mlp": 1.07632351, "epoch": 0.003607286719172729, "flos": 26140311002880.0, "grad_norm": 2.23102532260135, "language_loss": 0.99556005, "learning_rate": 2.4639836682781433e-06, "loss": 1.03657258, "num_input_tokens_seen": 734470, "step": 30, "time_per_iteration": 2.733118772506714 }, { "auxiliary_loss_clip": 0.02800612, "auxiliary_loss_mlp": 0.01252605, "balance_loss_clip": 1.52106225, "balance_loss_mlp": 1.0782733, "epoch": 0.00372752960981182, "flos": 20593082113920.0, "grad_norm": 2.6031654197911154, "language_loss": 1.00234509, "learning_rate": 2.487738122623307e-06, "loss": 1.04287732, "num_input_tokens_seen": 753380, "step": 31, "time_per_iteration": 2.639840602874756 }, { "auxiliary_loss_clip": 0.02730956, "auxiliary_loss_mlp": 0.01269211, "balance_loss_clip": 1.50536776, "balance_loss_mlp": 1.0801928, "epoch": 0.003847772500450911, "flos": 22674608282880.0, "grad_norm": 2.5107703003494652, "language_loss": 0.98916757, "learning_rate": 2.510738338534912e-06, "loss": 1.0291692, "num_input_tokens_seen": 772105, "step": 32, "time_per_iteration": 2.7080843448638916 }, { "auxiliary_loss_clip": 0.02676956, "auxiliary_loss_mlp": 0.01250241, "balance_loss_clip": 1.48908234, "balance_loss_mlp": 1.07133174, "epoch": 0.003968015391090002, "flos": 17967796882560.0, "grad_norm": 2.6116982395605985, "language_loss": 1.02511954, "learning_rate": 2.5330307420306648e-06, "loss": 1.06439161, "num_input_tokens_seen": 788955, "step": 33, "time_per_iteration": 2.6340301036834717 }, { "auxiliary_loss_clip": 0.02633222, "auxiliary_loss_mlp": 0.0126895, "balance_loss_clip": 1.47956562, "balance_loss_mlp": 1.08593965, "epoch": 0.004088258281729093, "flos": 27304103658240.0, "grad_norm": 4.51222339252788, "language_loss": 0.88114727, "learning_rate": 2.554657600279796e-06, "loss": 0.920169, "num_input_tokens_seen": 810230, "step": 34, "time_per_iteration": 2.724830150604248 }, { "auxiliary_loss_clip": 0.02551436, "auxiliary_loss_mlp": 0.01237731, "balance_loss_clip": 1.46114039, "balance_loss_mlp": 1.07083786, "epoch": 0.004208501172368184, "flos": 23258587599360.0, "grad_norm": 2.1750515826516494, "language_loss": 1.03274202, "learning_rate": 2.5756575039679493e-06, "loss": 1.07063377, "num_input_tokens_seen": 829780, "step": 35, "time_per_iteration": 2.663585662841797 }, { "auxiliary_loss_clip": 0.0253613, "auxiliary_loss_mlp": 0.0126914, "balance_loss_clip": 1.45268869, "balance_loss_mlp": 1.09032583, "epoch": 0.0043287440630072746, "flos": 17312104062720.0, "grad_norm": 1.8707606647659145, "language_loss": 0.95025724, "learning_rate": 2.5960657816942747e-06, "loss": 0.98830992, "num_input_tokens_seen": 848695, "step": 36, "time_per_iteration": 2.6781909465789795 }, { "auxiliary_loss_clip": 0.02335059, "auxiliary_loss_mlp": 0.01229429, "balance_loss_clip": 1.51892543, "balance_loss_mlp": 1.130247, "epoch": 0.004448986953646365, "flos": 53092491160320.0, "grad_norm": 1.4048744057290041, "language_loss": 0.60968435, "learning_rate": 2.6159148575788668e-06, "loss": 0.64532924, "num_input_tokens_seen": 906730, "step": 37, "time_per_iteration": 3.1554055213928223 }, { "auxiliary_loss_clip": 0.02453547, "auxiliary_loss_mlp": 0.01279358, "balance_loss_clip": 1.43425667, "balance_loss_mlp": 1.10483563, "epoch": 0.004569229844285457, "flos": 13444165866240.0, "grad_norm": 2.7718108300585858, "language_loss": 0.9864037, "learning_rate": 2.635234561171e-06, "loss": 1.02373278, "num_input_tokens_seen": 925125, "step": 38, "time_per_iteration": 2.666367769241333 }, { "auxiliary_loss_clip": 0.02431874, "auxiliary_loss_mlp": 0.01235588, "balance_loss_clip": 1.43749022, "balance_loss_mlp": 1.06640649, "epoch": 0.0046894727349245475, "flos": 16209609966720.0, "grad_norm": 2.331339898382537, "language_loss": 0.9418813, "learning_rate": 2.6540523970949877e-06, "loss": 0.97855592, "num_input_tokens_seen": 939970, "step": 39, "time_per_iteration": 2.670243501663208 }, { "auxiliary_loss_clip": 0.02410703, "auxiliary_loss_mlp": 0.012417, "balance_loss_clip": 1.42974162, "balance_loss_mlp": 1.08014727, "epoch": 0.004809715625563638, "flos": 23914244505600.0, "grad_norm": 2.6696682227032653, "language_loss": 0.92517465, "learning_rate": 2.6723937805519533e-06, "loss": 0.96169871, "num_input_tokens_seen": 957470, "step": 40, "time_per_iteration": 3.509359359741211 }, { "auxiliary_loss_clip": 0.02401928, "auxiliary_loss_mlp": 0.01228995, "balance_loss_clip": 1.4225893, "balance_loss_mlp": 1.0744046, "epoch": 0.00492995851620273, "flos": 20773030273920.0, "grad_norm": 2.387773784094591, "language_loss": 0.92971754, "learning_rate": 2.690282243737839e-06, "loss": 0.96602672, "num_input_tokens_seen": 976405, "step": 41, "time_per_iteration": 3.4301517009735107 }, { "auxiliary_loss_clip": 0.02336054, "auxiliary_loss_mlp": 0.01241003, "balance_loss_clip": 1.40155196, "balance_loss_mlp": 1.08116686, "epoch": 0.0050502014068418205, "flos": 20338655103360.0, "grad_norm": 3.1684386353437444, "language_loss": 0.99301088, "learning_rate": 2.7077396173840807e-06, "loss": 1.02878141, "num_input_tokens_seen": 994690, "step": 42, "time_per_iteration": 2.6879496574401855 }, { "auxiliary_loss_clip": 0.02335765, "auxiliary_loss_mlp": 0.01238816, "balance_loss_clip": 1.40957808, "balance_loss_mlp": 1.078408, "epoch": 0.005170444297480911, "flos": 25994872834560.0, "grad_norm": 3.1619378935936573, "language_loss": 0.92803317, "learning_rate": 2.7247861909342594e-06, "loss": 0.96377897, "num_input_tokens_seen": 1015615, "step": 43, "time_per_iteration": 2.718348503112793 }, { "auxiliary_loss_clip": 0.02303502, "auxiliary_loss_mlp": 0.01216371, "balance_loss_clip": 1.40111566, "balance_loss_mlp": 1.06330585, "epoch": 0.005290687188120003, "flos": 20954055841920.0, "grad_norm": 2.3169774312669373, "language_loss": 0.83036232, "learning_rate": 2.7414408543044743e-06, "loss": 0.86556101, "num_input_tokens_seen": 1031255, "step": 44, "time_per_iteration": 2.705641746520996 }, { "auxiliary_loss_clip": 0.02258889, "auxiliary_loss_mlp": 0.01210843, "balance_loss_clip": 1.38939393, "balance_loss_mlp": 1.05520368, "epoch": 0.005410930078759093, "flos": 15851401585920.0, "grad_norm": 4.235121524790522, "language_loss": 0.79133725, "learning_rate": 2.7577212237113157e-06, "loss": 0.82603467, "num_input_tokens_seen": 1048295, "step": 45, "time_per_iteration": 2.614469051361084 }, { "auxiliary_loss_clip": 0.02244704, "auxiliary_loss_mlp": 0.01209598, "balance_loss_clip": 1.38951397, "balance_loss_mlp": 1.06416225, "epoch": 0.005531172969398184, "flos": 21104988791040.0, "grad_norm": 2.425211912821761, "language_loss": 1.0416863, "learning_rate": 2.7736437536690466e-06, "loss": 1.07622933, "num_input_tokens_seen": 1067925, "step": 46, "time_per_iteration": 2.662057399749756 }, { "auxiliary_loss_clip": 0.0223589, "auxiliary_loss_mlp": 0.01215962, "balance_loss_clip": 1.3838706, "balance_loss_mlp": 1.07205296, "epoch": 0.005651415860037276, "flos": 20844887431680.0, "grad_norm": 2.001310035414953, "language_loss": 1.07784915, "learning_rate": 2.789223836941131e-06, "loss": 1.11236763, "num_input_tokens_seen": 1088060, "step": 47, "time_per_iteration": 2.667766809463501 }, { "auxiliary_loss_clip": 0.02204592, "auxiliary_loss_mlp": 0.01219969, "balance_loss_clip": 1.37509871, "balance_loss_mlp": 1.07481992, "epoch": 0.005771658750676366, "flos": 13260195383040.0, "grad_norm": 2.3723719715788563, "language_loss": 1.0880394, "learning_rate": 2.8044758939680847e-06, "loss": 1.12228489, "num_input_tokens_seen": 1104130, "step": 48, "time_per_iteration": 2.6212713718414307 }, { "auxiliary_loss_clip": 0.02167197, "auxiliary_loss_mlp": 0.01215579, "balance_loss_clip": 1.37099004, "balance_loss_mlp": 1.08072948, "epoch": 0.005891901641315457, "flos": 24425396997120.0, "grad_norm": 3.2283571462991545, "language_loss": 1.0180223, "learning_rate": 2.8194134530738863e-06, "loss": 1.05185008, "num_input_tokens_seen": 1122900, "step": 49, "time_per_iteration": 2.643406867980957 }, { "auxiliary_loss_clip": 0.02140832, "auxiliary_loss_mlp": 0.01212232, "balance_loss_clip": 1.36449337, "balance_loss_mlp": 1.08014846, "epoch": 0.006012144531954548, "flos": 23076197314560.0, "grad_norm": 4.250806309686046, "language_loss": 0.90274978, "learning_rate": 2.834049222568994e-06, "loss": 0.93628043, "num_input_tokens_seen": 1140250, "step": 50, "time_per_iteration": 2.6331348419189453 }, { "auxiliary_loss_clip": 0.02149088, "auxiliary_loss_mlp": 0.01210894, "balance_loss_clip": 1.36088276, "balance_loss_mlp": 1.07690275, "epoch": 0.006132387422593639, "flos": 22528775064960.0, "grad_norm": 3.3185420554971565, "language_loss": 0.92510939, "learning_rate": 2.848395155712969e-06, "loss": 0.95870924, "num_input_tokens_seen": 1160470, "step": 51, "time_per_iteration": 2.623654842376709 }, { "auxiliary_loss_clip": 0.02119848, "auxiliary_loss_mlp": 0.01211518, "balance_loss_clip": 1.35791373, "balance_loss_mlp": 1.08105576, "epoch": 0.00625263031323273, "flos": 27628340751360.0, "grad_norm": 2.3549538104501275, "language_loss": 0.97784567, "learning_rate": 2.8624625093687977e-06, "loss": 1.0111593, "num_input_tokens_seen": 1177605, "step": 52, "time_per_iteration": 2.6782426834106445 }, { "auxiliary_loss_clip": 0.02119765, "auxiliary_loss_mlp": 0.01209272, "balance_loss_clip": 1.35756218, "balance_loss_mlp": 1.08043051, "epoch": 0.006372873203871821, "flos": 23110671392640.0, "grad_norm": 2.7979322053929705, "language_loss": 0.89082581, "learning_rate": 2.876261897070029e-06, "loss": 0.92411613, "num_input_tokens_seen": 1197735, "step": 53, "time_per_iteration": 2.6332337856292725 }, { "auxiliary_loss_clip": 0.0210693, "auxiliary_loss_mlp": 0.01196524, "balance_loss_clip": 1.35462236, "balance_loss_mlp": 1.06873155, "epoch": 0.006493116094510912, "flos": 22856028900480.0, "grad_norm": 2.340559889093807, "language_loss": 0.92575788, "learning_rate": 2.889803337127447e-06, "loss": 0.95879239, "num_input_tokens_seen": 1216335, "step": 54, "time_per_iteration": 2.6453781127929688 }, { "auxiliary_loss_clip": 0.02075212, "auxiliary_loss_mlp": 0.01217384, "balance_loss_clip": 1.34608614, "balance_loss_mlp": 1.09254766, "epoch": 0.006613358985150003, "flos": 23071708114560.0, "grad_norm": 2.499119240416735, "language_loss": 0.84730953, "learning_rate": 2.903096296321516e-06, "loss": 0.88023549, "num_input_tokens_seen": 1234480, "step": 55, "time_per_iteration": 2.6286933422088623 }, { "auxiliary_loss_clip": 0.02053994, "auxiliary_loss_mlp": 0.01198798, "balance_loss_clip": 1.34349608, "balance_loss_mlp": 1.07653666, "epoch": 0.006733601875789094, "flos": 26537662229760.0, "grad_norm": 1.9640366710844803, "language_loss": 0.91531092, "learning_rate": 2.9161497296578907e-06, "loss": 0.94783884, "num_input_tokens_seen": 1253870, "step": 56, "time_per_iteration": 2.703251361846924 }, { "auxiliary_loss_clip": 0.02052004, "auxiliary_loss_mlp": 0.01199723, "balance_loss_clip": 1.33968043, "balance_loss_mlp": 1.08156323, "epoch": 0.006853844766428185, "flos": 15523178083200.0, "grad_norm": 2.9955183099440696, "language_loss": 0.85976094, "learning_rate": 2.928972116604173e-06, "loss": 0.89227831, "num_input_tokens_seen": 1270145, "step": 57, "time_per_iteration": 2.6440587043762207 }, { "auxiliary_loss_clip": 0.02027551, "auxiliary_loss_mlp": 0.01184574, "balance_loss_clip": 1.32832968, "balance_loss_mlp": 1.06555593, "epoch": 0.006974087657067276, "flos": 24243760897920.0, "grad_norm": 2.281978007295325, "language_loss": 1.02157617, "learning_rate": 2.9415714941751377e-06, "loss": 1.05369735, "num_input_tokens_seen": 1291365, "step": 58, "time_per_iteration": 2.663088798522949 }, { "auxiliary_loss_clip": 0.02022483, "auxiliary_loss_mlp": 0.01205806, "balance_loss_clip": 1.32719791, "balance_loss_mlp": 1.0892669, "epoch": 0.007094330547706367, "flos": 25772513690880.0, "grad_norm": 1.9678687188826345, "language_loss": 0.93580437, "learning_rate": 2.9539554871897396e-06, "loss": 0.96808732, "num_input_tokens_seen": 1311535, "step": 59, "time_per_iteration": 2.649562358856201 }, { "auxiliary_loss_clip": 0.01990422, "auxiliary_loss_mlp": 0.01195514, "balance_loss_clip": 1.32149422, "balance_loss_mlp": 1.08565044, "epoch": 0.007214573438345458, "flos": 21319015979520.0, "grad_norm": 2.049598517044452, "language_loss": 0.97361022, "learning_rate": 2.9661313359851253e-06, "loss": 1.00546956, "num_input_tokens_seen": 1329420, "step": 60, "time_per_iteration": 2.6267647743225098 }, { "auxiliary_loss_clip": 0.0197324, "auxiliary_loss_mlp": 0.01185337, "balance_loss_clip": 1.3135643, "balance_loss_mlp": 1.07966971, "epoch": 0.007334816328984549, "flos": 24937088192640.0, "grad_norm": 4.5420674050303935, "language_loss": 0.94012964, "learning_rate": 2.978105921839922e-06, "loss": 0.97171545, "num_input_tokens_seen": 1349965, "step": 61, "time_per_iteration": 2.6749684810638428 }, { "auxiliary_loss_clip": 0.01959505, "auxiliary_loss_mlp": 0.01166968, "balance_loss_clip": 1.31642532, "balance_loss_mlp": 1.06420946, "epoch": 0.00745505921962364, "flos": 18510586277760.0, "grad_norm": 2.4023068207806926, "language_loss": 0.72311914, "learning_rate": 2.9898857903302893e-06, "loss": 0.7543838, "num_input_tokens_seen": 1368915, "step": 62, "time_per_iteration": 2.6423768997192383 }, { "auxiliary_loss_clip": 0.01947438, "auxiliary_loss_mlp": 0.01184091, "balance_loss_clip": 1.30920172, "balance_loss_mlp": 1.07761288, "epoch": 0.007575302110262731, "flos": 18477656484480.0, "grad_norm": 3.027365462666879, "language_loss": 0.87883735, "learning_rate": 3.001477172817253e-06, "loss": 0.91015261, "num_input_tokens_seen": 1386805, "step": 63, "time_per_iteration": 2.617429256439209 }, { "auxiliary_loss_clip": 0.01930127, "auxiliary_loss_mlp": 0.01190527, "balance_loss_clip": 1.30227172, "balance_loss_mlp": 1.08342946, "epoch": 0.007695545000901822, "flos": 24973178382720.0, "grad_norm": 2.2528603802102496, "language_loss": 0.96358716, "learning_rate": 3.012886006241894e-06, "loss": 0.99479365, "num_input_tokens_seen": 1406190, "step": 64, "time_per_iteration": 2.6402382850646973 }, { "auxiliary_loss_clip": 0.01932982, "auxiliary_loss_mlp": 0.01179333, "balance_loss_clip": 1.30915737, "balance_loss_mlp": 1.07833874, "epoch": 0.007815787891540913, "flos": 21324223451520.0, "grad_norm": 1.868596309629138, "language_loss": 0.88130593, "learning_rate": 3.0241179513858383e-06, "loss": 0.91242909, "num_input_tokens_seen": 1425500, "step": 65, "time_per_iteration": 2.6497275829315186 }, { "auxiliary_loss_clip": 0.01906888, "auxiliary_loss_mlp": 0.01151698, "balance_loss_clip": 1.29048944, "balance_loss_mlp": 1.05232513, "epoch": 0.007936030782180003, "flos": 21575777374080.0, "grad_norm": 5.766846321945436, "language_loss": 0.87761009, "learning_rate": 3.035178409737647e-06, "loss": 0.90819597, "num_input_tokens_seen": 1442950, "step": 66, "time_per_iteration": 3.536262035369873 }, { "auxiliary_loss_clip": 0.0189392, "auxiliary_loss_mlp": 0.0117598, "balance_loss_clip": 1.28699124, "balance_loss_mlp": 1.07474685, "epoch": 0.008056273672819095, "flos": 20120785159680.0, "grad_norm": 2.5265376964167174, "language_loss": 0.88755441, "learning_rate": 3.046072539090907e-06, "loss": 0.91825342, "num_input_tokens_seen": 1460915, "step": 67, "time_per_iteration": 4.280246734619141 }, { "auxiliary_loss_clip": 0.01868238, "auxiliary_loss_mlp": 0.0116643, "balance_loss_clip": 1.28048182, "balance_loss_mlp": 1.06705761, "epoch": 0.008176516563458186, "flos": 18333116156160.0, "grad_norm": 2.2665359022515696, "language_loss": 1.04749846, "learning_rate": 3.056805267986779e-06, "loss": 1.07784522, "num_input_tokens_seen": 1478385, "step": 68, "time_per_iteration": 2.6140341758728027 }, { "auxiliary_loss_clip": 0.01861962, "auxiliary_loss_mlp": 0.01169469, "balance_loss_clip": 1.27846122, "balance_loss_mlp": 1.07228994, "epoch": 0.008296759454097276, "flos": 21872076664320.0, "grad_norm": 2.8743211876702204, "language_loss": 0.95459169, "learning_rate": 3.0673813091022194e-06, "loss": 0.98490602, "num_input_tokens_seen": 1497605, "step": 69, "time_per_iteration": 2.6104815006256104 }, { "auxiliary_loss_clip": 0.01696683, "auxiliary_loss_mlp": 0.010526, "balance_loss_clip": 1.30511189, "balance_loss_mlp": 1.0148344, "epoch": 0.008417002344736368, "flos": 63408228036480.0, "grad_norm": 1.3078048472863455, "language_loss": 0.62139696, "learning_rate": 3.0778051716749317e-06, "loss": 0.64888978, "num_input_tokens_seen": 1561150, "step": 70, "time_per_iteration": 3.251384973526001 }, { "auxiliary_loss_clip": 0.0182163, "auxiliary_loss_mlp": 0.0115885, "balance_loss_clip": 1.25820971, "balance_loss_mlp": 1.06395936, "epoch": 0.008537245235375458, "flos": 22966454286720.0, "grad_norm": 3.6799912470802356, "language_loss": 0.9033252, "learning_rate": 3.0880811730470094e-06, "loss": 0.93313003, "num_input_tokens_seen": 1580605, "step": 71, "time_per_iteration": 2.606001377105713 }, { "auxiliary_loss_clip": 0.0166769, "auxiliary_loss_mlp": 0.01042049, "balance_loss_clip": 1.29177213, "balance_loss_mlp": 1.00695348, "epoch": 0.008657488126014549, "flos": 61984046712960.0, "grad_norm": 1.1979577701255546, "language_loss": 0.58597147, "learning_rate": 3.098213449401257e-06, "loss": 0.61306882, "num_input_tokens_seen": 1647535, "step": 72, "time_per_iteration": 3.138117790222168 }, { "auxiliary_loss_clip": 0.01803025, "auxiliary_loss_mlp": 0.01165256, "balance_loss_clip": 1.25482249, "balance_loss_mlp": 1.07398975, "epoch": 0.00877773101665364, "flos": 30296791152000.0, "grad_norm": 2.4667256319691373, "language_loss": 0.98773623, "learning_rate": 3.1082059657570015e-06, "loss": 1.0174191, "num_input_tokens_seen": 1666770, "step": 73, "time_per_iteration": 2.762204170227051 }, { "auxiliary_loss_clip": 0.01798488, "auxiliary_loss_mlp": 0.0117941, "balance_loss_clip": 1.25453186, "balance_loss_mlp": 1.08962166, "epoch": 0.00889797390729273, "flos": 23514056104320.0, "grad_norm": 2.2211573237492193, "language_loss": 0.96656609, "learning_rate": 3.1180625252858496e-06, "loss": 0.99634498, "num_input_tokens_seen": 1685200, "step": 74, "time_per_iteration": 2.7041971683502197 }, { "auxiliary_loss_clip": 0.01774675, "auxiliary_loss_mlp": 0.01145069, "balance_loss_clip": 1.24564385, "balance_loss_mlp": 1.06815553, "epoch": 0.009018216797931822, "flos": 23075838178560.0, "grad_norm": 2.857009616960915, "language_loss": 0.80012983, "learning_rate": 3.1277867780021663e-06, "loss": 0.82932729, "num_input_tokens_seen": 1701835, "step": 75, "time_per_iteration": 2.6951398849487305 }, { "auxiliary_loss_clip": 0.01756258, "auxiliary_loss_mlp": 0.01147956, "balance_loss_clip": 1.23863006, "balance_loss_mlp": 1.06908679, "epoch": 0.009138459688570914, "flos": 15918877284480.0, "grad_norm": 2.2759806739176773, "language_loss": 0.95767343, "learning_rate": 3.1373822288779824e-06, "loss": 0.98671556, "num_input_tokens_seen": 1718415, "step": 76, "time_per_iteration": 2.620680570602417 }, { "auxiliary_loss_clip": 0.01755157, "auxiliary_loss_mlp": 0.01163957, "balance_loss_clip": 1.24133897, "balance_loss_mlp": 1.08804488, "epoch": 0.009258702579210003, "flos": 27016531372800.0, "grad_norm": 2.453429217808873, "language_loss": 0.79604065, "learning_rate": 3.1468522454274533e-06, "loss": 0.82523179, "num_input_tokens_seen": 1738770, "step": 77, "time_per_iteration": 2.6683523654937744 }, { "auxiliary_loss_clip": 0.01735096, "auxiliary_loss_mlp": 0.01148421, "balance_loss_clip": 1.23238397, "balance_loss_mlp": 1.07236588, "epoch": 0.009378945469849095, "flos": 26903196984960.0, "grad_norm": 2.2516331397481473, "language_loss": 0.91843367, "learning_rate": 3.15620006480197e-06, "loss": 0.94726884, "num_input_tokens_seen": 1758040, "step": 78, "time_per_iteration": 2.628598690032959 }, { "auxiliary_loss_clip": 0.01722777, "auxiliary_loss_mlp": 0.01152976, "balance_loss_clip": 1.22714949, "balance_loss_mlp": 1.07844615, "epoch": 0.009499188360488187, "flos": 35694236327040.0, "grad_norm": 3.44995959619227, "language_loss": 0.75064266, "learning_rate": 3.1654288004333087e-06, "loss": 0.77940023, "num_input_tokens_seen": 1776705, "step": 79, "time_per_iteration": 2.735701322555542 }, { "auxiliary_loss_clip": 0.01722334, "auxiliary_loss_mlp": 0.01130356, "balance_loss_clip": 1.22725534, "balance_loss_mlp": 1.05692279, "epoch": 0.009619431251127276, "flos": 21503201944320.0, "grad_norm": 2.473543331226019, "language_loss": 0.76173967, "learning_rate": 3.1745414482589353e-06, "loss": 0.79026651, "num_input_tokens_seen": 1795915, "step": 80, "time_per_iteration": 2.596764087677002 }, { "auxiliary_loss_clip": 0.01707214, "auxiliary_loss_mlp": 0.01136657, "balance_loss_clip": 1.22245479, "balance_loss_mlp": 1.06570315, "epoch": 0.009739674141766368, "flos": 17421056991360.0, "grad_norm": 52.57095678243634, "language_loss": 0.87097412, "learning_rate": 3.1835408925606204e-06, "loss": 0.89941281, "num_input_tokens_seen": 1814055, "step": 81, "time_per_iteration": 2.6182422637939453 }, { "auxiliary_loss_clip": 0.01697828, "auxiliary_loss_mlp": 0.01130204, "balance_loss_clip": 1.22360909, "balance_loss_mlp": 1.06244516, "epoch": 0.00985991703240546, "flos": 27527109246720.0, "grad_norm": 2.449771706006641, "language_loss": 0.89386612, "learning_rate": 3.1924299114448214e-06, "loss": 0.92214644, "num_input_tokens_seen": 1834535, "step": 82, "time_per_iteration": 2.659130573272705 }, { "auxiliary_loss_clip": 0.01694038, "auxiliary_loss_mlp": 0.01148796, "balance_loss_clip": 1.22047734, "balance_loss_mlp": 1.07994056, "epoch": 0.00998015992304455, "flos": 13808084509440.0, "grad_norm": 2.6534820905003613, "language_loss": 0.83321601, "learning_rate": 3.2012111819909055e-06, "loss": 0.86164439, "num_input_tokens_seen": 1851865, "step": 83, "time_per_iteration": 2.6104140281677246 }, { "auxiliary_loss_clip": 0.01679679, "auxiliary_loss_mlp": 0.01125962, "balance_loss_clip": 1.21525764, "balance_loss_mlp": 1.06087339, "epoch": 0.010100402813683641, "flos": 20191385341440.0, "grad_norm": 2.4150544968692365, "language_loss": 0.95006216, "learning_rate": 3.2098872850910627e-06, "loss": 0.97811848, "num_input_tokens_seen": 1868540, "step": 84, "time_per_iteration": 2.612663984298706 }, { "auxiliary_loss_clip": 0.01682143, "auxiliary_loss_mlp": 0.0112846, "balance_loss_clip": 1.21499741, "balance_loss_mlp": 1.05860376, "epoch": 0.010220645704322733, "flos": 17201642762880.0, "grad_norm": 3.3081700609859275, "language_loss": 0.8918491, "learning_rate": 3.2184607100038194e-06, "loss": 0.91995513, "num_input_tokens_seen": 1887180, "step": 85, "time_per_iteration": 2.61582350730896 }, { "auxiliary_loss_clip": 0.01683408, "auxiliary_loss_mlp": 0.01138015, "balance_loss_clip": 1.21688366, "balance_loss_mlp": 1.07292712, "epoch": 0.010340888594961822, "flos": 21470415805440.0, "grad_norm": 2.0090636295607385, "language_loss": 0.93072778, "learning_rate": 3.2269338586412414e-06, "loss": 0.95894206, "num_input_tokens_seen": 1904765, "step": 86, "time_per_iteration": 2.6697964668273926 }, { "auxiliary_loss_clip": 0.01666922, "auxiliary_loss_mlp": 0.01127021, "balance_loss_clip": 1.21045959, "balance_loss_mlp": 1.06736875, "epoch": 0.010461131485600914, "flos": 23002831785600.0, "grad_norm": 3.2065077979377437, "language_loss": 0.96568251, "learning_rate": 3.2353090496083106e-06, "loss": 0.99362195, "num_input_tokens_seen": 1922600, "step": 87, "time_per_iteration": 2.5955700874328613 }, { "auxiliary_loss_clip": 0.01652881, "auxiliary_loss_mlp": 0.01133292, "balance_loss_clip": 1.20469308, "balance_loss_mlp": 1.06910944, "epoch": 0.010581374376240005, "flos": 33546850571520.0, "grad_norm": 1.969711958388999, "language_loss": 0.81489229, "learning_rate": 3.2435885220114572e-06, "loss": 0.84275401, "num_input_tokens_seen": 1943950, "step": 88, "time_per_iteration": 2.7678210735321045 }, { "auxiliary_loss_clip": 0.01654163, "auxiliary_loss_mlp": 0.01136926, "balance_loss_clip": 1.20783734, "balance_loss_mlp": 1.07651079, "epoch": 0.010701617266879095, "flos": 21763087822080.0, "grad_norm": 2.050978301637826, "language_loss": 0.93889338, "learning_rate": 3.2517744390519113e-06, "loss": 0.96680427, "num_input_tokens_seen": 1962815, "step": 89, "time_per_iteration": 2.575794219970703 }, { "auxiliary_loss_clip": 0.01641301, "auxiliary_loss_mlp": 0.0111392, "balance_loss_clip": 1.19831264, "balance_loss_mlp": 1.05851138, "epoch": 0.010821860157518187, "flos": 19060199256960.0, "grad_norm": 1.9197973314752943, "language_loss": 0.75333911, "learning_rate": 3.259868891418298e-06, "loss": 0.7808913, "num_input_tokens_seen": 1980580, "step": 90, "time_per_iteration": 2.6197564601898193 }, { "auxiliary_loss_clip": 0.01647607, "auxiliary_loss_mlp": 0.01117331, "balance_loss_clip": 1.20769215, "balance_loss_mlp": 1.06182742, "epoch": 0.010942103048157278, "flos": 25447378757760.0, "grad_norm": 1.9585524923089532, "language_loss": 0.85209805, "learning_rate": 3.2678739004917757e-06, "loss": 0.87974739, "num_input_tokens_seen": 2000315, "step": 91, "time_per_iteration": 2.6192240715026855 }, { "auxiliary_loss_clip": 0.01627726, "auxiliary_loss_mlp": 0.01116929, "balance_loss_clip": 1.20025921, "balance_loss_mlp": 1.0605197, "epoch": 0.011062345938796368, "flos": 27493928058240.0, "grad_norm": 1.753230572550286, "language_loss": 0.9222917, "learning_rate": 3.275791421376029e-06, "loss": 0.94973826, "num_input_tokens_seen": 2023760, "step": 92, "time_per_iteration": 3.434644937515259 }, { "auxiliary_loss_clip": 0.01629189, "auxiliary_loss_mlp": 0.01122177, "balance_loss_clip": 1.19942498, "balance_loss_mlp": 1.06433678, "epoch": 0.01118258882943546, "flos": 16071210864000.0, "grad_norm": 2.520805703205127, "language_loss": 0.95918185, "learning_rate": 3.2836233457634622e-06, "loss": 0.98669553, "num_input_tokens_seen": 2041895, "step": 93, "time_per_iteration": 4.795124292373657 }, { "auxiliary_loss_clip": 0.01624645, "auxiliary_loss_mlp": 0.01120945, "balance_loss_clip": 1.19853473, "balance_loss_mlp": 1.06663322, "epoch": 0.011302831720074551, "flos": 20668602458880.0, "grad_norm": 1.950314419045031, "language_loss": 0.85309523, "learning_rate": 3.2913715046481135e-06, "loss": 0.8805511, "num_input_tokens_seen": 2061640, "step": 94, "time_per_iteration": 2.612501859664917 }, { "auxiliary_loss_clip": 0.01612217, "auxiliary_loss_mlp": 0.01105653, "balance_loss_clip": 1.19238138, "balance_loss_mlp": 1.04976797, "epoch": 0.011423074610713641, "flos": 13072238490240.0, "grad_norm": 2.0115840429806724, "language_loss": 0.89022559, "learning_rate": 3.299037670895023e-06, "loss": 0.91740429, "num_input_tokens_seen": 2078255, "step": 95, "time_per_iteration": 2.581861734390259 }, { "auxiliary_loss_clip": 0.0162201, "auxiliary_loss_mlp": 0.01109103, "balance_loss_clip": 1.20257556, "balance_loss_mlp": 1.06053698, "epoch": 0.011543317501352733, "flos": 30335646689280.0, "grad_norm": 2.335774190296398, "language_loss": 0.80262685, "learning_rate": 3.3066235616750667e-06, "loss": 0.82993793, "num_input_tokens_seen": 2099490, "step": 96, "time_per_iteration": 2.6745665073394775 }, { "auxiliary_loss_clip": 0.015998, "auxiliary_loss_mlp": 0.01111173, "balance_loss_clip": 1.18832874, "balance_loss_mlp": 1.06303644, "epoch": 0.011663560391991824, "flos": 15522962601600.0, "grad_norm": 2.1509186643232474, "language_loss": 0.92303348, "learning_rate": 3.3141308407736276e-06, "loss": 0.95014316, "num_input_tokens_seen": 2116125, "step": 97, "time_per_iteration": 2.577507972717285 }, { "auxiliary_loss_clip": 0.01594088, "auxiliary_loss_mlp": 0.01110039, "balance_loss_clip": 1.18061233, "balance_loss_mlp": 1.05916059, "epoch": 0.011783803282630914, "flos": 19902125116800.0, "grad_norm": 1.9118960346543807, "language_loss": 0.8669939, "learning_rate": 3.321561120780869e-06, "loss": 0.89403516, "num_input_tokens_seen": 2134835, "step": 98, "time_per_iteration": 2.5909221172332764 }, { "auxiliary_loss_clip": 0.01595471, "auxiliary_loss_mlp": 0.0111065, "balance_loss_clip": 1.19286859, "balance_loss_mlp": 1.06494498, "epoch": 0.011904046173270006, "flos": 22340674517760.0, "grad_norm": 2.695088146677972, "language_loss": 1.01540196, "learning_rate": 3.3289159651708192e-06, "loss": 1.04246306, "num_input_tokens_seen": 2152410, "step": 99, "time_per_iteration": 2.5690414905548096 }, { "auxiliary_loss_clip": 0.01589214, "auxiliary_loss_mlp": 0.01108249, "balance_loss_clip": 1.18928754, "balance_loss_mlp": 1.06225824, "epoch": 0.012024289063909096, "flos": 19100060375040.0, "grad_norm": 2.0058858725524455, "language_loss": 0.97671264, "learning_rate": 3.3361968902759768e-06, "loss": 1.00368738, "num_input_tokens_seen": 2172090, "step": 100, "time_per_iteration": 2.5943081378936768 }, { "auxiliary_loss_clip": 0.01577268, "auxiliary_loss_mlp": 0.01109525, "balance_loss_clip": 1.18775868, "balance_loss_mlp": 1.06436861, "epoch": 0.012144531954548187, "flos": 15012205159680.0, "grad_norm": 2.1074223861283157, "language_loss": 0.93874091, "learning_rate": 3.343405367163663e-06, "loss": 0.96560884, "num_input_tokens_seen": 2189020, "step": 101, "time_per_iteration": 2.6027944087982178 }, { "auxiliary_loss_clip": 0.01580876, "auxiliary_loss_mlp": 0.01109176, "balance_loss_clip": 1.18550253, "balance_loss_mlp": 1.06618881, "epoch": 0.012264774845187279, "flos": 15122020014720.0, "grad_norm": 2.267818428956463, "language_loss": 0.81062484, "learning_rate": 3.350542823419951e-06, "loss": 0.83752531, "num_input_tokens_seen": 2205620, "step": 102, "time_per_iteration": 2.576003313064575 }, { "auxiliary_loss_clip": 0.01575112, "auxiliary_loss_mlp": 0.01106025, "balance_loss_clip": 1.18440056, "balance_loss_mlp": 1.05943799, "epoch": 0.012385017735826368, "flos": 13949248959360.0, "grad_norm": 4.169865642013476, "language_loss": 0.87617517, "learning_rate": 3.3576106448465615e-06, "loss": 0.90298653, "num_input_tokens_seen": 2219000, "step": 103, "time_per_iteration": 2.578104019165039 }, { "auxiliary_loss_clip": 0.01570133, "auxiliary_loss_mlp": 0.01122099, "balance_loss_clip": 1.17977023, "balance_loss_mlp": 1.07634616, "epoch": 0.01250526062646546, "flos": 23623260428160.0, "grad_norm": 2.4388233641723254, "language_loss": 0.8803333, "learning_rate": 3.3646101770757797e-06, "loss": 0.90725553, "num_input_tokens_seen": 2237790, "step": 104, "time_per_iteration": 2.667443037033081 }, { "auxiliary_loss_clip": 0.01568706, "auxiliary_loss_mlp": 0.01095717, "balance_loss_clip": 1.18405902, "balance_loss_mlp": 1.05344534, "epoch": 0.012625503517104552, "flos": 34640078958720.0, "grad_norm": 2.1792645868865708, "language_loss": 0.85637951, "learning_rate": 3.371542727108104e-06, "loss": 0.88302374, "num_input_tokens_seen": 2259965, "step": 105, "time_per_iteration": 2.7457308769226074 }, { "auxiliary_loss_clip": 0.01568199, "auxiliary_loss_mlp": 0.01095988, "balance_loss_clip": 1.18159127, "balance_loss_mlp": 1.05507517, "epoch": 0.012745746407743641, "flos": 17821891837440.0, "grad_norm": 3.3076253332945047, "language_loss": 0.90158856, "learning_rate": 3.3784095647770114e-06, "loss": 0.92823046, "num_input_tokens_seen": 2278610, "step": 106, "time_per_iteration": 2.6095962524414062 }, { "auxiliary_loss_clip": 0.01556989, "auxiliary_loss_mlp": 0.0111149, "balance_loss_clip": 1.17649579, "balance_loss_mlp": 1.06824064, "epoch": 0.012865989298382733, "flos": 20595057361920.0, "grad_norm": 3.4918115954453315, "language_loss": 0.88824439, "learning_rate": 3.3852119241449547e-06, "loss": 0.91492915, "num_input_tokens_seen": 2297730, "step": 107, "time_per_iteration": 2.5979321002960205 }, { "auxiliary_loss_clip": 0.01554947, "auxiliary_loss_mlp": 0.01105896, "balance_loss_clip": 1.17780042, "balance_loss_mlp": 1.06631827, "epoch": 0.012986232189021825, "flos": 23948969978880.0, "grad_norm": 2.2252672285471156, "language_loss": 0.96202177, "learning_rate": 3.3919510048344295e-06, "loss": 0.98863024, "num_input_tokens_seen": 2315740, "step": 108, "time_per_iteration": 2.627350330352783 }, { "auxiliary_loss_clip": 0.0154982, "auxiliary_loss_mlp": 0.01099091, "balance_loss_clip": 1.17764211, "balance_loss_mlp": 1.06256485, "epoch": 0.013106475079660914, "flos": 23725425686400.0, "grad_norm": 2.342722546609425, "language_loss": 0.86791027, "learning_rate": 3.3986279732976907e-06, "loss": 0.8943994, "num_input_tokens_seen": 2334215, "step": 109, "time_per_iteration": 2.6053249835968018 }, { "auxiliary_loss_clip": 0.01547528, "auxiliary_loss_mlp": 0.01086079, "balance_loss_clip": 1.17781806, "balance_loss_mlp": 1.05007803, "epoch": 0.013226717970300006, "flos": 21102438925440.0, "grad_norm": 2.207749537151733, "language_loss": 0.95349258, "learning_rate": 3.4052439640284983e-06, "loss": 0.9798286, "num_input_tokens_seen": 2353130, "step": 110, "time_per_iteration": 2.6069517135620117 }, { "auxiliary_loss_clip": 0.01538109, "auxiliary_loss_mlp": 0.01096616, "balance_loss_clip": 1.17132282, "balance_loss_mlp": 1.05973268, "epoch": 0.013346960860939098, "flos": 24863902231680.0, "grad_norm": 1.7341826275082421, "language_loss": 0.81045955, "learning_rate": 3.4118000807190217e-06, "loss": 0.83680677, "num_input_tokens_seen": 2374010, "step": 111, "time_per_iteration": 2.596102476119995 }, { "auxiliary_loss_clip": 0.01541809, "auxiliary_loss_mlp": 0.01095109, "balance_loss_clip": 1.17327666, "balance_loss_mlp": 1.05875039, "epoch": 0.013467203751578187, "flos": 28181940140160.0, "grad_norm": 2.191465930464939, "language_loss": 0.76036549, "learning_rate": 3.4182973973648723e-06, "loss": 0.78673464, "num_input_tokens_seen": 2395220, "step": 112, "time_per_iteration": 2.653881072998047 }, { "auxiliary_loss_clip": 0.0153594, "auxiliary_loss_mlp": 0.01094859, "balance_loss_clip": 1.1702354, "balance_loss_mlp": 1.05690265, "epoch": 0.013587446642217279, "flos": 18916233546240.0, "grad_norm": 2.670462361131529, "language_loss": 0.95204437, "learning_rate": 3.424736959321014e-06, "loss": 0.97835237, "num_input_tokens_seen": 2413025, "step": 113, "time_per_iteration": 2.543848991394043 }, { "auxiliary_loss_clip": 0.01532884, "auxiliary_loss_mlp": 0.01093502, "balance_loss_clip": 1.17091274, "balance_loss_mlp": 1.05871677, "epoch": 0.01370768953285637, "flos": 23988615615360.0, "grad_norm": 2.1531393542132413, "language_loss": 0.88732773, "learning_rate": 3.431119784311155e-06, "loss": 0.91359162, "num_input_tokens_seen": 2432700, "step": 114, "time_per_iteration": 2.6215999126434326 }, { "auxiliary_loss_clip": 0.01528723, "auxiliary_loss_mlp": 0.01112311, "balance_loss_clip": 1.16879427, "balance_loss_mlp": 1.0796473, "epoch": 0.01382793242349546, "flos": 39202565512320.0, "grad_norm": 3.667384507034687, "language_loss": 0.77449298, "learning_rate": 3.43744686339307e-06, "loss": 0.80090332, "num_input_tokens_seen": 2455020, "step": 115, "time_per_iteration": 2.7326204776763916 }, { "auxiliary_loss_clip": 0.01527806, "auxiliary_loss_mlp": 0.01105652, "balance_loss_clip": 1.16489601, "balance_loss_mlp": 1.06841052, "epoch": 0.013948175314134552, "flos": 41353506714240.0, "grad_norm": 2.1083415486986317, "language_loss": 0.91046405, "learning_rate": 3.44371916188212e-06, "loss": 0.93679863, "num_input_tokens_seen": 2475775, "step": 116, "time_per_iteration": 2.753671407699585 }, { "auxiliary_loss_clip": 0.0152828, "auxiliary_loss_mlp": 0.01094853, "balance_loss_clip": 1.16817081, "balance_loss_mlp": 1.06254745, "epoch": 0.014068418204773643, "flos": 22453542028800.0, "grad_norm": 2.331354152164771, "language_loss": 0.86081684, "learning_rate": 3.449937620235143e-06, "loss": 0.88704818, "num_input_tokens_seen": 2496370, "step": 117, "time_per_iteration": 3.39797043800354 }, { "auxiliary_loss_clip": 0.01521551, "auxiliary_loss_mlp": 0.01103766, "balance_loss_clip": 1.16536784, "balance_loss_mlp": 1.07291484, "epoch": 0.014188661095412733, "flos": 23805147922560.0, "grad_norm": 1.7247211910075553, "language_loss": 0.89481235, "learning_rate": 3.456103154896722e-06, "loss": 0.92106551, "num_input_tokens_seen": 2517645, "step": 118, "time_per_iteration": 2.6332106590270996 }, { "auxiliary_loss_clip": 0.01511365, "auxiliary_loss_mlp": 0.01089684, "balance_loss_clip": 1.16303468, "balance_loss_mlp": 1.05683017, "epoch": 0.014308903986051825, "flos": 23660248458240.0, "grad_norm": 2.3849491942730485, "language_loss": 0.92460579, "learning_rate": 3.462216659109757e-06, "loss": 0.95061624, "num_input_tokens_seen": 2537825, "step": 119, "time_per_iteration": 4.125097990036011 }, { "auxiliary_loss_clip": 0.01516607, "auxiliary_loss_mlp": 0.01093069, "balance_loss_clip": 1.1648972, "balance_loss_mlp": 1.06209815, "epoch": 0.014429146876690916, "flos": 20667991927680.0, "grad_norm": 2.27335723178321, "language_loss": 0.85126913, "learning_rate": 3.4682790036921077e-06, "loss": 0.87736595, "num_input_tokens_seen": 2556485, "step": 120, "time_per_iteration": 2.5914762020111084 }, { "auxiliary_loss_clip": 0.01509401, "auxiliary_loss_mlp": 0.01086409, "balance_loss_clip": 1.16299069, "balance_loss_mlp": 1.05704761, "epoch": 0.014549389767330006, "flos": 20229199384320.0, "grad_norm": 2.0301343346690324, "language_loss": 0.83341813, "learning_rate": 3.4742910377810193e-06, "loss": 0.85937619, "num_input_tokens_seen": 2573945, "step": 121, "time_per_iteration": 2.587437391281128 }, { "auxiliary_loss_clip": 0.01511513, "auxiliary_loss_mlp": 0.01090445, "balance_loss_clip": 1.16439426, "balance_loss_mlp": 1.0614531, "epoch": 0.014669632657969098, "flos": 18004174381440.0, "grad_norm": 6.26819121268211, "language_loss": 0.88623309, "learning_rate": 3.4802535895469042e-06, "loss": 0.91225272, "num_input_tokens_seen": 2592695, "step": 122, "time_per_iteration": 2.537217617034912 }, { "auxiliary_loss_clip": 0.01508156, "auxiliary_loss_mlp": 0.01084531, "balance_loss_clip": 1.16541624, "balance_loss_mlp": 1.05663586, "epoch": 0.01478987554860819, "flos": 22741796672640.0, "grad_norm": 1.9301833841520135, "language_loss": 0.89698803, "learning_rate": 3.4861674668779934e-06, "loss": 0.92291492, "num_input_tokens_seen": 2610925, "step": 123, "time_per_iteration": 2.587790012359619 }, { "auxiliary_loss_clip": 0.01501833, "auxiliary_loss_mlp": 0.01086957, "balance_loss_clip": 1.15893316, "balance_loss_mlp": 1.05826378, "epoch": 0.01491011843924728, "flos": 17198590106880.0, "grad_norm": 1.8233780493803151, "language_loss": 0.84132445, "learning_rate": 3.492033458037272e-06, "loss": 0.8672123, "num_input_tokens_seen": 2629495, "step": 124, "time_per_iteration": 2.5837199687957764 }, { "auxiliary_loss_clip": 0.01499895, "auxiliary_loss_mlp": 0.01082574, "balance_loss_clip": 1.15823507, "balance_loss_mlp": 1.05455995, "epoch": 0.01503036132988637, "flos": 17673867889920.0, "grad_norm": 3.05631289621366, "language_loss": 0.86898708, "learning_rate": 3.497852332293018e-06, "loss": 0.89481175, "num_input_tokens_seen": 2645070, "step": 125, "time_per_iteration": 2.5391080379486084 }, { "auxiliary_loss_clip": 0.01504806, "auxiliary_loss_mlp": 0.01079669, "balance_loss_clip": 1.16009521, "balance_loss_mlp": 1.05115438, "epoch": 0.015150604220525462, "flos": 18878239935360.0, "grad_norm": 4.610784062353718, "language_loss": 0.96682847, "learning_rate": 3.5036248405242356e-06, "loss": 0.99267322, "num_input_tokens_seen": 2663825, "step": 126, "time_per_iteration": 2.545975685119629 }, { "auxiliary_loss_clip": 0.01499331, "auxiliary_loss_mlp": 0.01092533, "balance_loss_clip": 1.15824389, "balance_loss_mlp": 1.06464982, "epoch": 0.015270847111164552, "flos": 39420184060800.0, "grad_norm": 2.0763385236895, "language_loss": 0.82519841, "learning_rate": 3.509351715802146e-06, "loss": 0.85111707, "num_input_tokens_seen": 2684710, "step": 127, "time_per_iteration": 2.727416515350342 }, { "auxiliary_loss_clip": 0.01495719, "auxiliary_loss_mlp": 0.01080554, "balance_loss_clip": 1.15637851, "balance_loss_mlp": 1.05389857, "epoch": 0.015391090001803644, "flos": 43762466286720.0, "grad_norm": 2.205606749855949, "language_loss": 0.78285837, "learning_rate": 3.5150336739488763e-06, "loss": 0.80862111, "num_input_tokens_seen": 2706995, "step": 128, "time_per_iteration": 2.779639482498169 }, { "auxiliary_loss_clip": 0.01489636, "auxiliary_loss_mlp": 0.01070541, "balance_loss_clip": 1.15665209, "balance_loss_mlp": 1.04666317, "epoch": 0.015511332892442733, "flos": 18916341287040.0, "grad_norm": 1.9280132084612605, "language_loss": 0.84154439, "learning_rate": 3.5206714140744143e-06, "loss": 0.86714619, "num_input_tokens_seen": 2727050, "step": 129, "time_per_iteration": 2.5853497982025146 }, { "auxiliary_loss_clip": 0.01496167, "auxiliary_loss_mlp": 0.01108898, "balance_loss_clip": 1.16463065, "balance_loss_mlp": 1.08212316, "epoch": 0.015631575783081827, "flos": 24535283679360.0, "grad_norm": 2.378796961498555, "language_loss": 0.87463653, "learning_rate": 3.5262656190928208e-06, "loss": 0.90068722, "num_input_tokens_seen": 2745350, "step": 130, "time_per_iteration": 2.602620840072632 }, { "auxiliary_loss_clip": 0.01476459, "auxiliary_loss_mlp": 0.01066746, "balance_loss_clip": 1.23059487, "balance_loss_mlp": 1.05830622, "epoch": 0.015751818673720917, "flos": 62328536098560.0, "grad_norm": 1.076836062950782, "language_loss": 0.71515107, "learning_rate": 3.5318169562186737e-06, "loss": 0.74058312, "num_input_tokens_seen": 2814195, "step": 131, "time_per_iteration": 3.1902523040771484 }, { "auxiliary_loss_clip": 0.01488477, "auxiliary_loss_mlp": 0.01082521, "balance_loss_clip": 1.15625262, "balance_loss_mlp": 1.05627131, "epoch": 0.015872061564360006, "flos": 23878549365120.0, "grad_norm": 2.762149221162883, "language_loss": 0.82204282, "learning_rate": 3.5373260774446292e-06, "loss": 0.84775281, "num_input_tokens_seen": 2834645, "step": 132, "time_per_iteration": 2.5911712646484375 }, { "auxiliary_loss_clip": 0.01479214, "auxiliary_loss_mlp": 0.010884, "balance_loss_clip": 1.14925098, "balance_loss_mlp": 1.06374753, "epoch": 0.0159923044549991, "flos": 23367899664000.0, "grad_norm": 1.9345753706105984, "language_loss": 0.90561771, "learning_rate": 3.542793620000961e-06, "loss": 0.93129385, "num_input_tokens_seen": 2854120, "step": 133, "time_per_iteration": 2.5576682090759277 }, { "auxiliary_loss_clip": 0.01480723, "auxiliary_loss_mlp": 0.01073027, "balance_loss_clip": 1.15646648, "balance_loss_mlp": 1.05099702, "epoch": 0.01611254734563819, "flos": 17858305249920.0, "grad_norm": 2.733308464703463, "language_loss": 0.86870593, "learning_rate": 3.5482202067978894e-06, "loss": 0.89424342, "num_input_tokens_seen": 2871330, "step": 134, "time_per_iteration": 2.548170566558838 }, { "auxiliary_loss_clip": 0.01486711, "auxiliary_loss_mlp": 0.01078825, "balance_loss_clip": 1.15762949, "balance_loss_mlp": 1.05436289, "epoch": 0.01623279023627728, "flos": 20954774113920.0, "grad_norm": 1.927889197472661, "language_loss": 0.75770152, "learning_rate": 3.553606446851471e-06, "loss": 0.78335679, "num_input_tokens_seen": 2888070, "step": 135, "time_per_iteration": 2.5574491024017334 }, { "auxiliary_loss_clip": 0.01472967, "auxiliary_loss_mlp": 0.01080618, "balance_loss_clip": 1.15282369, "balance_loss_mlp": 1.05831432, "epoch": 0.016353033126916373, "flos": 15742412743680.0, "grad_norm": 1.799429396959038, "language_loss": 0.83424121, "learning_rate": 3.5589529356937613e-06, "loss": 0.85977703, "num_input_tokens_seen": 2906465, "step": 136, "time_per_iteration": 2.577397346496582 }, { "auxiliary_loss_clip": 0.0147336, "auxiliary_loss_mlp": 0.01088117, "balance_loss_clip": 1.15184355, "balance_loss_mlp": 1.06407273, "epoch": 0.016473276017555463, "flos": 18807280617600.0, "grad_norm": 1.7127651406520321, "language_loss": 0.76989675, "learning_rate": 3.5642602557679627e-06, "loss": 0.79551148, "num_input_tokens_seen": 2924915, "step": 137, "time_per_iteration": 2.5387141704559326 }, { "auxiliary_loss_clip": 0.01484932, "auxiliary_loss_mlp": 0.0108215, "balance_loss_clip": 1.16264665, "balance_loss_mlp": 1.06143165, "epoch": 0.016593518908194552, "flos": 24352641999360.0, "grad_norm": 2.2864913619152, "language_loss": 0.8439154, "learning_rate": 3.569528976809202e-06, "loss": 0.86958623, "num_input_tokens_seen": 2942130, "step": 138, "time_per_iteration": 2.602400779724121 }, { "auxiliary_loss_clip": 0.01470971, "auxiliary_loss_mlp": 0.01077691, "balance_loss_clip": 1.14954877, "balance_loss_mlp": 1.05433774, "epoch": 0.016713761798833646, "flos": 22346133384960.0, "grad_norm": 1.7369319450492473, "language_loss": 0.89912593, "learning_rate": 3.5747596562115522e-06, "loss": 0.92461246, "num_input_tokens_seen": 2962745, "step": 139, "time_per_iteration": 2.567641019821167 }, { "auxiliary_loss_clip": 0.0146965, "auxiliary_loss_mlp": 0.01073377, "balance_loss_clip": 1.14831567, "balance_loss_mlp": 1.04865289, "epoch": 0.016834004689472735, "flos": 17821820010240.0, "grad_norm": 2.6647306671690867, "language_loss": 0.90891987, "learning_rate": 3.5799528393819138e-06, "loss": 0.93435013, "num_input_tokens_seen": 2981825, "step": 140, "time_per_iteration": 2.5600266456604004 }, { "auxiliary_loss_clip": 0.01463805, "auxiliary_loss_mlp": 0.01083064, "balance_loss_clip": 1.14604521, "balance_loss_mlp": 1.05904377, "epoch": 0.016954247580111825, "flos": 20519501103360.0, "grad_norm": 1.830327226729161, "language_loss": 0.87970334, "learning_rate": 3.585109060081286e-06, "loss": 0.90517199, "num_input_tokens_seen": 3001625, "step": 141, "time_per_iteration": 2.5528087615966797 }, { "auxiliary_loss_clip": 0.01468021, "auxiliary_loss_mlp": 0.01083845, "balance_loss_clip": 1.14799333, "balance_loss_mlp": 1.06081367, "epoch": 0.017074490470750915, "flos": 22088869200000.0, "grad_norm": 4.323909885520045, "language_loss": 0.78534317, "learning_rate": 3.590228840753992e-06, "loss": 0.81086183, "num_input_tokens_seen": 3022055, "step": 142, "time_per_iteration": 2.6188130378723145 }, { "auxiliary_loss_clip": 0.01465754, "auxiliary_loss_mlp": 0.01074089, "balance_loss_clip": 1.1535064, "balance_loss_mlp": 1.05198812, "epoch": 0.01719473336139001, "flos": 15997270717440.0, "grad_norm": 2.2018648885133576, "language_loss": 0.87477881, "learning_rate": 3.5953126928453423e-06, "loss": 0.90017724, "num_input_tokens_seen": 3039605, "step": 143, "time_per_iteration": 2.5492570400238037 }, { "auxiliary_loss_clip": 0.01456476, "auxiliary_loss_mlp": 0.01075666, "balance_loss_clip": 1.1464963, "balance_loss_mlp": 1.05387425, "epoch": 0.017314976252029098, "flos": 22492038430080.0, "grad_norm": 1.9994867971396193, "language_loss": 0.80514395, "learning_rate": 3.600361117108239e-06, "loss": 0.83046538, "num_input_tokens_seen": 3059405, "step": 144, "time_per_iteration": 3.3276917934417725 }, { "auxiliary_loss_clip": 0.01458583, "auxiliary_loss_mlp": 0.01068948, "balance_loss_clip": 1.14614332, "balance_loss_mlp": 1.04710889, "epoch": 0.017435219142668188, "flos": 22018053536640.0, "grad_norm": 1.8683916270628007, "language_loss": 0.97108692, "learning_rate": 3.6053746038991616e-06, "loss": 0.99636221, "num_input_tokens_seen": 3078490, "step": 145, "time_per_iteration": 4.07668662071228 }, { "auxiliary_loss_clip": 0.01417444, "auxiliary_loss_mlp": 0.01055835, "balance_loss_clip": 1.19005215, "balance_loss_mlp": 1.04787219, "epoch": 0.01755546203330728, "flos": 72240526149120.0, "grad_norm": 1.0745818248686478, "language_loss": 0.58418965, "learning_rate": 3.6103536334639843e-06, "loss": 0.60892248, "num_input_tokens_seen": 3131755, "step": 146, "time_per_iteration": 3.801753044128418 }, { "auxiliary_loss_clip": 0.01450057, "auxiliary_loss_mlp": 0.01069313, "balance_loss_clip": 1.14471602, "balance_loss_mlp": 1.05012, "epoch": 0.01767570492394637, "flos": 25337061112320.0, "grad_norm": 2.9486675317738795, "language_loss": 0.85580921, "learning_rate": 3.615298676214041e-06, "loss": 0.8810029, "num_input_tokens_seen": 3152035, "step": 147, "time_per_iteration": 2.574413299560547 }, { "auxiliary_loss_clip": 0.01450675, "auxiliary_loss_mlp": 0.01078457, "balance_loss_clip": 1.13923168, "balance_loss_mlp": 1.05808425, "epoch": 0.01779594781458546, "flos": 20449188230400.0, "grad_norm": 2.112017758271063, "language_loss": 0.88778573, "learning_rate": 3.6202101929928317e-06, "loss": 0.91307712, "num_input_tokens_seen": 3170625, "step": 148, "time_per_iteration": 2.5598063468933105 }, { "auxiliary_loss_clip": 0.01447156, "auxiliary_loss_mlp": 0.01080671, "balance_loss_clip": 1.1403861, "balance_loss_mlp": 1.06025004, "epoch": 0.017916190705224554, "flos": 16253601148800.0, "grad_norm": 1.8918304167550135, "language_loss": 0.88464844, "learning_rate": 3.6250886353337413e-06, "loss": 0.90992671, "num_input_tokens_seen": 3188155, "step": 149, "time_per_iteration": 2.5236122608184814 }, { "auxiliary_loss_clip": 0.01449003, "auxiliary_loss_mlp": 0.01080387, "balance_loss_clip": 1.14243829, "balance_loss_mlp": 1.06026459, "epoch": 0.018036433595863644, "flos": 23330588411520.0, "grad_norm": 1.8209729436042834, "language_loss": 0.86479127, "learning_rate": 3.6299344457091488e-06, "loss": 0.89008522, "num_input_tokens_seen": 3209015, "step": 150, "time_per_iteration": 2.5801820755004883 }, { "auxiliary_loss_clip": 0.01445774, "auxiliary_loss_mlp": 0.01068364, "balance_loss_clip": 1.14245319, "balance_loss_mlp": 1.04831314, "epoch": 0.018156676486502734, "flos": 18588010043520.0, "grad_norm": 2.2351663049974424, "language_loss": 0.93964016, "learning_rate": 3.634748057771256e-06, "loss": 0.96478152, "num_input_tokens_seen": 3224955, "step": 151, "time_per_iteration": 2.539951801300049 }, { "auxiliary_loss_clip": 0.01435242, "auxiliary_loss_mlp": 0.01074169, "balance_loss_clip": 1.13546216, "balance_loss_mlp": 1.0544163, "epoch": 0.018276919377141827, "flos": 25448707560960.0, "grad_norm": 1.6348035898079096, "language_loss": 0.85596395, "learning_rate": 3.639529896584965e-06, "loss": 0.88105798, "num_input_tokens_seen": 3246330, "step": 152, "time_per_iteration": 2.613234519958496 }, { "auxiliary_loss_clip": 0.01443738, "auxiliary_loss_mlp": 0.01067455, "balance_loss_clip": 1.14062595, "balance_loss_mlp": 1.04634309, "epoch": 0.018397162267780917, "flos": 20047311889920.0, "grad_norm": 2.606444504678349, "language_loss": 0.88838601, "learning_rate": 3.6442803788531233e-06, "loss": 0.91349792, "num_input_tokens_seen": 3264290, "step": 153, "time_per_iteration": 2.563673734664917 }, { "auxiliary_loss_clip": 0.01445411, "auxiliary_loss_mlp": 0.01078054, "balance_loss_clip": 1.1394124, "balance_loss_mlp": 1.05768085, "epoch": 0.018517405158420007, "flos": 27565282425600.0, "grad_norm": 2.060910793302648, "language_loss": 0.96093982, "learning_rate": 3.6489999131344357e-06, "loss": 0.98617446, "num_input_tokens_seen": 3287065, "step": 154, "time_per_iteration": 2.63460373878479 }, { "auxiliary_loss_clip": 0.01438094, "auxiliary_loss_mlp": 0.01067942, "balance_loss_clip": 1.14096057, "balance_loss_mlp": 1.05000114, "epoch": 0.0186376480490591, "flos": 19354056422400.0, "grad_norm": 2.122365780209798, "language_loss": 0.90655541, "learning_rate": 3.653688900054313e-06, "loss": 0.93161583, "num_input_tokens_seen": 3305595, "step": 155, "time_per_iteration": 2.568570613861084 }, { "auxiliary_loss_clip": 0.01433327, "auxiliary_loss_mlp": 0.01071085, "balance_loss_clip": 1.13061881, "balance_loss_mlp": 1.05038989, "epoch": 0.01875789093969819, "flos": 26687840993280.0, "grad_norm": 2.2629009078284237, "language_loss": 0.76199782, "learning_rate": 3.6583477325089526e-06, "loss": 0.78704196, "num_input_tokens_seen": 3326135, "step": 156, "time_per_iteration": 2.62390398979187 }, { "auxiliary_loss_clip": 0.0143138, "auxiliary_loss_mlp": 0.01070991, "balance_loss_clip": 1.13526893, "balance_loss_mlp": 1.05366945, "epoch": 0.01887813383033728, "flos": 24353001135360.0, "grad_norm": 2.3531410029512436, "language_loss": 1.04149842, "learning_rate": 3.6629767958628916e-06, "loss": 1.066522, "num_input_tokens_seen": 3343510, "step": 157, "time_per_iteration": 2.6013338565826416 }, { "auxiliary_loss_clip": 0.01432496, "auxiliary_loss_mlp": 0.01069986, "balance_loss_clip": 1.13698876, "balance_loss_mlp": 1.04974377, "epoch": 0.018998376720976373, "flos": 14647532330880.0, "grad_norm": 2.637442910487536, "language_loss": 0.85747558, "learning_rate": 3.667576468140291e-06, "loss": 0.88250035, "num_input_tokens_seen": 3361325, "step": 158, "time_per_iteration": 2.626528024673462 }, { "auxiliary_loss_clip": 0.01424745, "auxiliary_loss_mlp": 0.01067383, "balance_loss_clip": 1.12962055, "balance_loss_mlp": 1.04751086, "epoch": 0.019118619611615463, "flos": 29305261146240.0, "grad_norm": 2.4630778475537918, "language_loss": 0.88749409, "learning_rate": 3.672147120210184e-06, "loss": 0.91241533, "num_input_tokens_seen": 3377925, "step": 159, "time_per_iteration": 2.6341779232025146 }, { "auxiliary_loss_clip": 0.01428859, "auxiliary_loss_mlp": 0.01075031, "balance_loss_clip": 1.13594997, "balance_loss_mlp": 1.05729234, "epoch": 0.019238862502254553, "flos": 20886723797760.0, "grad_norm": 1.797663650971881, "language_loss": 0.8628124, "learning_rate": 3.6766891159659177e-06, "loss": 0.88785136, "num_input_tokens_seen": 3396335, "step": 160, "time_per_iteration": 2.596874475479126 }, { "auxiliary_loss_clip": 0.01421546, "auxiliary_loss_mlp": 0.0106988, "balance_loss_clip": 1.13127685, "balance_loss_mlp": 1.05115223, "epoch": 0.019359105392893646, "flos": 21360672777600.0, "grad_norm": 2.6273400874916653, "language_loss": 0.8772254, "learning_rate": 3.6812028124990075e-06, "loss": 0.90213966, "num_input_tokens_seen": 3413605, "step": 161, "time_per_iteration": 2.557422637939453 }, { "auxiliary_loss_clip": 0.0141918, "auxiliary_loss_mlp": 0.01073971, "balance_loss_clip": 1.12891114, "balance_loss_mlp": 1.05548131, "epoch": 0.019479348283532736, "flos": 16283729681280.0, "grad_norm": 2.8986242324716294, "language_loss": 0.81670123, "learning_rate": 3.6856885602676016e-06, "loss": 0.84163272, "num_input_tokens_seen": 3429640, "step": 162, "time_per_iteration": 2.571394205093384 }, { "auxiliary_loss_clip": 0.01430632, "auxiliary_loss_mlp": 0.01069152, "balance_loss_clip": 1.13442016, "balance_loss_mlp": 1.04999518, "epoch": 0.019599591174171826, "flos": 22091239497600.0, "grad_norm": 3.159001397831375, "language_loss": 0.94293463, "learning_rate": 3.6901467032597733e-06, "loss": 0.96793246, "num_input_tokens_seen": 3448125, "step": 163, "time_per_iteration": 2.5990772247314453 }, { "auxiliary_loss_clip": 0.01423009, "auxiliary_loss_mlp": 0.01070837, "balance_loss_clip": 1.12974906, "balance_loss_mlp": 1.051072, "epoch": 0.01971983406481092, "flos": 19609668581760.0, "grad_norm": 2.119640135879872, "language_loss": 0.8749615, "learning_rate": 3.694577579151804e-06, "loss": 0.89989996, "num_input_tokens_seen": 3466535, "step": 164, "time_per_iteration": 2.6559078693389893 }, { "auxiliary_loss_clip": 0.01419591, "auxiliary_loss_mlp": 0.01077024, "balance_loss_clip": 1.13154387, "balance_loss_mlp": 1.05790305, "epoch": 0.01984007695545001, "flos": 19099342103040.0, "grad_norm": 24.524710308639005, "language_loss": 0.73692065, "learning_rate": 3.6989815194616703e-06, "loss": 0.76188684, "num_input_tokens_seen": 3483730, "step": 165, "time_per_iteration": 2.657019853591919 }, { "auxiliary_loss_clip": 0.01428943, "auxiliary_loss_mlp": 0.01066098, "balance_loss_clip": 1.13362384, "balance_loss_mlp": 1.04690564, "epoch": 0.0199603198460891, "flos": 20848406964480.0, "grad_norm": 2.398816831753953, "language_loss": 0.79931611, "learning_rate": 3.703358849697888e-06, "loss": 0.82426655, "num_input_tokens_seen": 3503640, "step": 166, "time_per_iteration": 2.667200803756714 }, { "auxiliary_loss_clip": 0.01418677, "auxiliary_loss_mlp": 0.01068854, "balance_loss_clip": 1.13120079, "balance_loss_mlp": 1.0514617, "epoch": 0.020080562736728192, "flos": 21870747861120.0, "grad_norm": 3.089628923113304, "language_loss": 0.82608318, "learning_rate": 3.7077098895038803e-06, "loss": 0.85095853, "num_input_tokens_seen": 3523010, "step": 167, "time_per_iteration": 2.65098237991333 }, { "auxiliary_loss_clip": 0.01422312, "auxiliary_loss_mlp": 0.0106119, "balance_loss_clip": 1.13055468, "balance_loss_mlp": 1.0426048, "epoch": 0.020200805627367282, "flos": 21688788539520.0, "grad_norm": 2.007966795344019, "language_loss": 0.97143447, "learning_rate": 3.712034952798045e-06, "loss": 0.99626952, "num_input_tokens_seen": 3541125, "step": 168, "time_per_iteration": 2.5685195922851562 }, { "auxiliary_loss_clip": 0.01422459, "auxiliary_loss_mlp": 0.01071358, "balance_loss_clip": 1.1298666, "balance_loss_mlp": 1.05193865, "epoch": 0.02032104851800637, "flos": 33543043729920.0, "grad_norm": 2.2190764627820996, "language_loss": 0.8470583, "learning_rate": 3.7163343479096656e-06, "loss": 0.87199652, "num_input_tokens_seen": 3562700, "step": 169, "time_per_iteration": 2.702793836593628 }, { "auxiliary_loss_clip": 0.01405235, "auxiliary_loss_mlp": 0.01071161, "balance_loss_clip": 1.12324238, "balance_loss_mlp": 1.05370879, "epoch": 0.020441291408645465, "flos": 31686965274240.0, "grad_norm": 4.002684992177676, "language_loss": 0.82808733, "learning_rate": 3.720608377710802e-06, "loss": 0.85285127, "num_input_tokens_seen": 3582790, "step": 170, "time_per_iteration": 3.3532819747924805 }, { "auxiliary_loss_clip": 0.01410675, "auxiliary_loss_mlp": 0.01068737, "balance_loss_clip": 1.12391329, "balance_loss_mlp": 1.04986596, "epoch": 0.020561534299284555, "flos": 20886687884160.0, "grad_norm": 4.945543180689287, "language_loss": 0.86576974, "learning_rate": 3.7248573397443277e-06, "loss": 0.89056385, "num_input_tokens_seen": 3601715, "step": 171, "time_per_iteration": 2.631410837173462 }, { "auxiliary_loss_clip": 0.0141607, "auxiliary_loss_mlp": 0.01060235, "balance_loss_clip": 1.13143539, "balance_loss_mlp": 1.04336691, "epoch": 0.020681777189923645, "flos": 20996610480000.0, "grad_norm": 2.021790414086321, "language_loss": 0.97713482, "learning_rate": 3.729081526348224e-06, "loss": 1.00189781, "num_input_tokens_seen": 3620245, "step": 172, "time_per_iteration": 4.148843765258789 }, { "auxiliary_loss_clip": 0.01408567, "auxiliary_loss_mlp": 0.01066341, "balance_loss_clip": 1.12354517, "balance_loss_mlp": 1.04916263, "epoch": 0.020802020080562738, "flos": 28257532312320.0, "grad_norm": 1.8095065813638709, "language_loss": 0.84940994, "learning_rate": 3.7332812247762777e-06, "loss": 0.87415898, "num_input_tokens_seen": 3641545, "step": 173, "time_per_iteration": 3.777919292449951 }, { "auxiliary_loss_clip": 0.01411052, "auxiliary_loss_mlp": 0.01069095, "balance_loss_clip": 1.12820864, "balance_loss_mlp": 1.05124903, "epoch": 0.020922262971201828, "flos": 19681274344320.0, "grad_norm": 2.2808659113988505, "language_loss": 0.95120305, "learning_rate": 3.737456717315293e-06, "loss": 0.97600454, "num_input_tokens_seen": 3660510, "step": 174, "time_per_iteration": 2.6159298419952393 }, { "auxiliary_loss_clip": 0.01401879, "auxiliary_loss_mlp": 0.01077649, "balance_loss_clip": 1.12322628, "balance_loss_mlp": 1.05982709, "epoch": 0.021042505861840918, "flos": 15666353694720.0, "grad_norm": 1.7428217854056662, "language_loss": 0.90690953, "learning_rate": 3.7416082813989552e-06, "loss": 0.93170476, "num_input_tokens_seen": 3677505, "step": 175, "time_per_iteration": 2.648185968399048 }, { "auxiliary_loss_clip": 0.01407783, "auxiliary_loss_mlp": 0.01066788, "balance_loss_clip": 1.12161112, "balance_loss_mlp": 1.04852498, "epoch": 0.02116274875248001, "flos": 21142012734720.0, "grad_norm": 1.9710511398244437, "language_loss": 0.89327276, "learning_rate": 3.745736189718439e-06, "loss": 0.91801846, "num_input_tokens_seen": 3696760, "step": 176, "time_per_iteration": 2.5951342582702637 }, { "auxiliary_loss_clip": 0.01397411, "auxiliary_loss_mlp": 0.01063139, "balance_loss_clip": 1.11888957, "balance_loss_mlp": 1.04647326, "epoch": 0.0212829916431191, "flos": 24715770543360.0, "grad_norm": 6.055573310511585, "language_loss": 0.72492576, "learning_rate": 3.749840710329894e-06, "loss": 0.74953127, "num_input_tokens_seen": 3717465, "step": 177, "time_per_iteration": 2.589200258255005 }, { "auxiliary_loss_clip": 0.01406321, "auxiliary_loss_mlp": 0.01063112, "balance_loss_clip": 1.12352002, "balance_loss_mlp": 1.04408622, "epoch": 0.02140323453375819, "flos": 16645493508480.0, "grad_norm": 2.6676046724362608, "language_loss": 0.98060441, "learning_rate": 3.7539221067588938e-06, "loss": 1.00529873, "num_input_tokens_seen": 3731440, "step": 178, "time_per_iteration": 2.5173823833465576 }, { "auxiliary_loss_clip": 0.0140629, "auxiliary_loss_mlp": 0.01084874, "balance_loss_clip": 1.12297571, "balance_loss_mlp": 1.06589556, "epoch": 0.021523477424397284, "flos": 20299332689280.0, "grad_norm": 5.430752276085542, "language_loss": 0.93840122, "learning_rate": 3.757980638101964e-06, "loss": 0.96331292, "num_input_tokens_seen": 3744935, "step": 179, "time_per_iteration": 2.5842807292938232 }, { "auxiliary_loss_clip": 0.01403066, "auxiliary_loss_mlp": 0.01067692, "balance_loss_clip": 1.1224407, "balance_loss_mlp": 1.04937005, "epoch": 0.021643720315036374, "flos": 26104005331200.0, "grad_norm": 2.0669926176854263, "language_loss": 0.89775437, "learning_rate": 3.7620165591252806e-06, "loss": 0.92246187, "num_input_tokens_seen": 3763035, "step": 180, "time_per_iteration": 2.5774319171905518 }, { "auxiliary_loss_clip": 0.01400148, "auxiliary_loss_mlp": 0.0107207, "balance_loss_clip": 1.12314165, "balance_loss_mlp": 1.05657303, "epoch": 0.021763963205675464, "flos": 24787663614720.0, "grad_norm": 2.0648149266139626, "language_loss": 0.94337201, "learning_rate": 3.766030120360636e-06, "loss": 0.96809411, "num_input_tokens_seen": 3782665, "step": 181, "time_per_iteration": 2.6220462322235107 }, { "auxiliary_loss_clip": 0.01398727, "auxiliary_loss_mlp": 0.01060231, "balance_loss_clip": 1.11917794, "balance_loss_mlp": 1.04252851, "epoch": 0.021884206096314557, "flos": 25813559957760.0, "grad_norm": 1.9933825787311061, "language_loss": 0.9031933, "learning_rate": 3.7700215681987578e-06, "loss": 0.92778289, "num_input_tokens_seen": 3802435, "step": 182, "time_per_iteration": 2.568166494369507 }, { "auxiliary_loss_clip": 0.01397827, "auxiliary_loss_mlp": 0.01063926, "balance_loss_clip": 1.12121618, "balance_loss_mlp": 1.04586601, "epoch": 0.022004448986953647, "flos": 20082719721600.0, "grad_norm": 1.7083174013032125, "language_loss": 0.82206285, "learning_rate": 3.7739911449800767e-06, "loss": 0.84668034, "num_input_tokens_seen": 3822490, "step": 183, "time_per_iteration": 2.6015920639038086 }, { "auxiliary_loss_clip": 0.01395285, "auxiliary_loss_mlp": 0.0106668, "balance_loss_clip": 1.11631227, "balance_loss_mlp": 1.04956162, "epoch": 0.022124691877592736, "flos": 20480609652480.0, "grad_norm": 3.3593239422888828, "language_loss": 0.80787718, "learning_rate": 3.7779390890830114e-06, "loss": 0.83249688, "num_input_tokens_seen": 3841140, "step": 184, "time_per_iteration": 2.541952133178711 }, { "auxiliary_loss_clip": 0.01401212, "auxiliary_loss_mlp": 0.01063504, "balance_loss_clip": 1.11930692, "balance_loss_mlp": 1.04748225, "epoch": 0.02224493476823183, "flos": 23586847015680.0, "grad_norm": 1.9649897730512753, "language_loss": 0.85946041, "learning_rate": 3.7818656350098723e-06, "loss": 0.88410759, "num_input_tokens_seen": 3862090, "step": 185, "time_per_iteration": 2.633087158203125 }, { "auxiliary_loss_clip": 0.01394853, "auxiliary_loss_mlp": 0.01067037, "balance_loss_clip": 1.11646259, "balance_loss_mlp": 1.04962111, "epoch": 0.02236517765887092, "flos": 16909940413440.0, "grad_norm": 2.5645972941324366, "language_loss": 0.77138811, "learning_rate": 3.7857710134704447e-06, "loss": 0.79600704, "num_input_tokens_seen": 3881025, "step": 186, "time_per_iteration": 2.531020164489746 }, { "auxiliary_loss_clip": 0.0138888, "auxiliary_loss_mlp": 0.01068104, "balance_loss_clip": 1.11589277, "balance_loss_mlp": 1.05022252, "epoch": 0.02248542054951001, "flos": 43508182930560.0, "grad_norm": 2.2528827725049276, "language_loss": 0.79152197, "learning_rate": 3.7896554514633234e-06, "loss": 0.81609178, "num_input_tokens_seen": 3905310, "step": 187, "time_per_iteration": 2.7522711753845215 }, { "auxiliary_loss_clip": 0.01384519, "auxiliary_loss_mlp": 0.01059279, "balance_loss_clip": 1.11232233, "balance_loss_mlp": 1.04256582, "epoch": 0.022605663440149103, "flos": 23367648268800.0, "grad_norm": 2.40896698141146, "language_loss": 0.83972859, "learning_rate": 3.7935191723550955e-06, "loss": 0.86416656, "num_input_tokens_seen": 3924265, "step": 188, "time_per_iteration": 2.551440715789795 }, { "auxiliary_loss_clip": 0.01380961, "auxiliary_loss_mlp": 0.01057129, "balance_loss_clip": 1.11154723, "balance_loss_mlp": 1.04225135, "epoch": 0.022725906330788193, "flos": 29019915504000.0, "grad_norm": 2.202555841232679, "language_loss": 0.88383079, "learning_rate": 3.797362395957408e-06, "loss": 0.90821171, "num_input_tokens_seen": 3944830, "step": 189, "time_per_iteration": 2.626603603363037 }, { "auxiliary_loss_clip": 0.01389374, "auxiliary_loss_mlp": 0.01060635, "balance_loss_clip": 1.11681533, "balance_loss_mlp": 1.04280114, "epoch": 0.022846149221427282, "flos": 24496176746880.0, "grad_norm": 4.161102080520976, "language_loss": 0.78231037, "learning_rate": 3.8011853386020055e-06, "loss": 0.80681044, "num_input_tokens_seen": 3965735, "step": 190, "time_per_iteration": 2.5811970233917236 }, { "auxiliary_loss_clip": 0.01384862, "auxiliary_loss_mlp": 0.01057414, "balance_loss_clip": 1.1134975, "balance_loss_mlp": 1.03993785, "epoch": 0.022966392112066376, "flos": 15523537219200.0, "grad_norm": 2.540935842084887, "language_loss": 0.89846992, "learning_rate": 3.804988213213804e-06, "loss": 0.92289269, "num_input_tokens_seen": 3983975, "step": 191, "time_per_iteration": 2.5411465167999268 }, { "auxiliary_loss_clip": 0.01355599, "auxiliary_loss_mlp": 0.01018091, "balance_loss_clip": 1.15855932, "balance_loss_mlp": 1.0087446, "epoch": 0.023086635002705466, "flos": 55650408433920.0, "grad_norm": 1.0216338981293598, "language_loss": 0.63189375, "learning_rate": 3.808771229382049e-06, "loss": 0.65563071, "num_input_tokens_seen": 4043440, "step": 192, "time_per_iteration": 3.0271992683410645 }, { "auxiliary_loss_clip": 0.0137893, "auxiliary_loss_mlp": 0.01052618, "balance_loss_clip": 1.10847664, "balance_loss_mlp": 1.03685844, "epoch": 0.023206877893344555, "flos": 19313441118720.0, "grad_norm": 2.026556519369347, "language_loss": 0.84456909, "learning_rate": 3.8125345934296324e-06, "loss": 0.86888462, "num_input_tokens_seen": 4061750, "step": 193, "time_per_iteration": 2.5397098064422607 }, { "auxiliary_loss_clip": 0.01387198, "auxiliary_loss_mlp": 0.0106634, "balance_loss_clip": 1.11238658, "balance_loss_mlp": 1.0498054, "epoch": 0.02332712078398365, "flos": 23072965090560.0, "grad_norm": 2.117772883896705, "language_loss": 0.8771137, "learning_rate": 3.81627850848061e-06, "loss": 0.90164912, "num_input_tokens_seen": 4082345, "step": 194, "time_per_iteration": 2.5548768043518066 }, { "auxiliary_loss_clip": 0.01377763, "auxiliary_loss_mlp": 0.01070273, "balance_loss_clip": 1.10996997, "balance_loss_mlp": 1.05549145, "epoch": 0.02344736367462274, "flos": 24425971614720.0, "grad_norm": 2.175690275204565, "language_loss": 0.86286664, "learning_rate": 3.820003174525994e-06, "loss": 0.88734704, "num_input_tokens_seen": 4101770, "step": 195, "time_per_iteration": 2.577684164047241 }, { "auxiliary_loss_clip": 0.01372954, "auxiliary_loss_mlp": 0.0105794, "balance_loss_clip": 1.10774279, "balance_loss_mlp": 1.04268706, "epoch": 0.02356760656526183, "flos": 21579799697280.0, "grad_norm": 2.380772844356794, "language_loss": 0.83011055, "learning_rate": 3.823708788487851e-06, "loss": 0.85441947, "num_input_tokens_seen": 4118770, "step": 196, "time_per_iteration": 2.514930486679077 }, { "auxiliary_loss_clip": 0.01379229, "auxiliary_loss_mlp": 0.01074726, "balance_loss_clip": 1.1118772, "balance_loss_mlp": 1.05888867, "epoch": 0.02368784945590092, "flos": 25193598192000.0, "grad_norm": 1.888798046041017, "language_loss": 0.84661829, "learning_rate": 3.827395544281781e-06, "loss": 0.87115788, "num_input_tokens_seen": 4141110, "step": 197, "time_per_iteration": 3.3496015071868896 }, { "auxiliary_loss_clip": 0.0137764, "auxiliary_loss_mlp": 0.01059592, "balance_loss_clip": 1.11045456, "balance_loss_mlp": 1.04311728, "epoch": 0.02380809234654001, "flos": 27562481164800.0, "grad_norm": 2.0412373988252392, "language_loss": 0.78996813, "learning_rate": 3.831063632877802e-06, "loss": 0.81434047, "num_input_tokens_seen": 4161430, "step": 198, "time_per_iteration": 2.620748281478882 }, { "auxiliary_loss_clip": 0.01373728, "auxiliary_loss_mlp": 0.01060566, "balance_loss_clip": 1.11249316, "balance_loss_mlp": 1.04548562, "epoch": 0.0239283352371791, "flos": 18259786540800.0, "grad_norm": 2.4973835053823414, "language_loss": 0.75715494, "learning_rate": 3.834713242359712e-06, "loss": 0.78149784, "num_input_tokens_seen": 4179260, "step": 199, "time_per_iteration": 4.188012599945068 }, { "auxiliary_loss_clip": 0.01372148, "auxiliary_loss_mlp": 0.01055516, "balance_loss_clip": 1.10652089, "balance_loss_mlp": 1.04019213, "epoch": 0.02404857812781819, "flos": 21395110942080.0, "grad_norm": 1.8406229148544657, "language_loss": 0.87191784, "learning_rate": 3.838344557982959e-06, "loss": 0.89619452, "num_input_tokens_seen": 4200640, "step": 200, "time_per_iteration": 2.5780439376831055 }, { "auxiliary_loss_clip": 0.01367217, "auxiliary_loss_mlp": 0.01048915, "balance_loss_clip": 1.10380507, "balance_loss_mlp": 1.03335786, "epoch": 0.024168821018457284, "flos": 16654256426880.0, "grad_norm": 2.5556274138500843, "language_loss": 0.84963012, "learning_rate": 3.841957762231063e-06, "loss": 0.87379134, "num_input_tokens_seen": 4218170, "step": 201, "time_per_iteration": 2.53367018699646 }, { "auxiliary_loss_clip": 0.01367393, "auxiliary_loss_mlp": 0.01065034, "balance_loss_clip": 1.10361958, "balance_loss_mlp": 1.04995394, "epoch": 0.024289063909096374, "flos": 22820872464000.0, "grad_norm": 2.372302389341696, "language_loss": 0.87688529, "learning_rate": 3.8455530348706454e-06, "loss": 0.90120959, "num_input_tokens_seen": 4237770, "step": 202, "time_per_iteration": 2.5297961235046387 }, { "auxiliary_loss_clip": 0.01367834, "auxiliary_loss_mlp": 0.01056776, "balance_loss_clip": 1.10619771, "balance_loss_mlp": 1.04123139, "epoch": 0.024409306799735464, "flos": 17748598135680.0, "grad_norm": 2.291040841664121, "language_loss": 0.77661228, "learning_rate": 3.849130553005099e-06, "loss": 0.80085838, "num_input_tokens_seen": 4255985, "step": 203, "time_per_iteration": 2.510061025619507 }, { "auxiliary_loss_clip": 0.01370629, "auxiliary_loss_mlp": 0.01058855, "balance_loss_clip": 1.10799456, "balance_loss_mlp": 1.04173672, "epoch": 0.024529549690374557, "flos": 21616213109760.0, "grad_norm": 1.6852667136055821, "language_loss": 0.83667123, "learning_rate": 3.852690491126933e-06, "loss": 0.86096609, "num_input_tokens_seen": 4276035, "step": 204, "time_per_iteration": 2.5535197257995605 }, { "auxiliary_loss_clip": 0.01369096, "auxiliary_loss_mlp": 0.01065778, "balance_loss_clip": 1.10478115, "balance_loss_mlp": 1.04926753, "epoch": 0.024649792581013647, "flos": 25551662918400.0, "grad_norm": 2.269993467498683, "language_loss": 0.91096079, "learning_rate": 3.856233021168845e-06, "loss": 0.93530953, "num_input_tokens_seen": 4295730, "step": 205, "time_per_iteration": 2.6161463260650635 }, { "auxiliary_loss_clip": 0.01361517, "auxiliary_loss_mlp": 0.01053096, "balance_loss_clip": 1.10084772, "balance_loss_mlp": 1.03875542, "epoch": 0.024770035471652737, "flos": 34495574544000.0, "grad_norm": 2.1339780736405505, "language_loss": 0.91457516, "learning_rate": 3.859758312553544e-06, "loss": 0.9387213, "num_input_tokens_seen": 4317950, "step": 206, "time_per_iteration": 2.6812021732330322 }, { "auxiliary_loss_clip": 0.01367583, "auxiliary_loss_mlp": 0.01062321, "balance_loss_clip": 1.10557389, "balance_loss_mlp": 1.04638267, "epoch": 0.02489027836229183, "flos": 21505428587520.0, "grad_norm": 1.7928459392664282, "language_loss": 0.91850138, "learning_rate": 3.8632665322423735e-06, "loss": 0.9428004, "num_input_tokens_seen": 4337605, "step": 207, "time_per_iteration": 2.5833051204681396 }, { "auxiliary_loss_clip": 0.01362237, "auxiliary_loss_mlp": 0.01078059, "balance_loss_clip": 1.10268044, "balance_loss_mlp": 1.0629549, "epoch": 0.02501052125293092, "flos": 23219013790080.0, "grad_norm": 1.647325932826466, "language_loss": 0.8595947, "learning_rate": 3.866757844782762e-06, "loss": 0.88399762, "num_input_tokens_seen": 4358110, "step": 208, "time_per_iteration": 2.582639217376709 }, { "auxiliary_loss_clip": 0.01360574, "auxiliary_loss_mlp": 0.01072155, "balance_loss_clip": 1.10277367, "balance_loss_mlp": 1.05597878, "epoch": 0.02513076414357001, "flos": 26388920010240.0, "grad_norm": 2.4193652798123213, "language_loss": 0.91475713, "learning_rate": 3.870232412354527e-06, "loss": 0.93908441, "num_input_tokens_seen": 4374955, "step": 209, "time_per_iteration": 2.6014459133148193 }, { "auxiliary_loss_clip": 0.01362131, "auxiliary_loss_mlp": 0.01055153, "balance_loss_clip": 1.10086262, "balance_loss_mlp": 1.04022813, "epoch": 0.025251007034209103, "flos": 13590430047360.0, "grad_norm": 2.3922685349550874, "language_loss": 0.92197978, "learning_rate": 3.873690394815086e-06, "loss": 0.94615269, "num_input_tokens_seen": 4391535, "step": 210, "time_per_iteration": 2.541599988937378 }, { "auxiliary_loss_clip": 0.01360421, "auxiliary_loss_mlp": 0.01059907, "balance_loss_clip": 1.10148454, "balance_loss_mlp": 1.04430223, "epoch": 0.025371249924848193, "flos": 15049229103360.0, "grad_norm": 2.979794761301918, "language_loss": 0.91024959, "learning_rate": 3.877131949743587e-06, "loss": 0.93445277, "num_input_tokens_seen": 4408400, "step": 211, "time_per_iteration": 2.559276819229126 }, { "auxiliary_loss_clip": 0.01358608, "auxiliary_loss_mlp": 0.01064336, "balance_loss_clip": 1.10031843, "balance_loss_mlp": 1.04845762, "epoch": 0.025491492815487283, "flos": 25553853648000.0, "grad_norm": 3.1962614535762737, "language_loss": 0.78099656, "learning_rate": 3.880557232483993e-06, "loss": 0.80522597, "num_input_tokens_seen": 4427840, "step": 212, "time_per_iteration": 2.5640130043029785 }, { "auxiliary_loss_clip": 0.01353685, "auxiliary_loss_mlp": 0.01054975, "balance_loss_clip": 1.09528208, "balance_loss_mlp": 1.03931117, "epoch": 0.025611735706126376, "flos": 20630752502400.0, "grad_norm": 1.9011805707700036, "language_loss": 0.86806357, "learning_rate": 3.883966396187164e-06, "loss": 0.89215016, "num_input_tokens_seen": 4447110, "step": 213, "time_per_iteration": 2.551992654800415 }, { "auxiliary_loss_clip": 0.01357932, "auxiliary_loss_mlp": 0.01055588, "balance_loss_clip": 1.10243177, "balance_loss_mlp": 1.04056728, "epoch": 0.025731978596765466, "flos": 19062282245760.0, "grad_norm": 2.1001890165969774, "language_loss": 0.89962023, "learning_rate": 3.887359591851937e-06, "loss": 0.92375541, "num_input_tokens_seen": 4464715, "step": 214, "time_per_iteration": 2.522515296936035 }, { "auxiliary_loss_clip": 0.01349348, "auxiliary_loss_mlp": 0.0105735, "balance_loss_clip": 1.09714103, "balance_loss_mlp": 1.04279423, "epoch": 0.025852221487404556, "flos": 22163814927360.0, "grad_norm": 1.6143044097267194, "language_loss": 0.92470986, "learning_rate": 3.890736968365265e-06, "loss": 0.94877684, "num_input_tokens_seen": 4485030, "step": 215, "time_per_iteration": 2.5590529441833496 }, { "auxiliary_loss_clip": 0.01351818, "auxiliary_loss_mlp": 0.01051289, "balance_loss_clip": 1.0962857, "balance_loss_mlp": 1.03622103, "epoch": 0.02597246437804365, "flos": 26541971861760.0, "grad_norm": 31.792986009628954, "language_loss": 0.85065651, "learning_rate": 3.894098672541412e-06, "loss": 0.87468767, "num_input_tokens_seen": 4505935, "step": 216, "time_per_iteration": 2.563410997390747 }, { "auxiliary_loss_clip": 0.01352634, "auxiliary_loss_mlp": 0.01054062, "balance_loss_clip": 1.09759808, "balance_loss_mlp": 1.03844547, "epoch": 0.02609270726868274, "flos": 32671671696000.0, "grad_norm": 2.2379833621656857, "language_loss": 0.75454122, "learning_rate": 3.89744484916025e-06, "loss": 0.7786082, "num_input_tokens_seen": 4527045, "step": 217, "time_per_iteration": 2.686225414276123 }, { "auxiliary_loss_clip": 0.01354128, "auxiliary_loss_mlp": 0.01069194, "balance_loss_clip": 1.09946012, "balance_loss_mlp": 1.05229056, "epoch": 0.02621295015932183, "flos": 26243553669120.0, "grad_norm": 1.8798467979218316, "language_loss": 0.87242806, "learning_rate": 3.900775641004673e-06, "loss": 0.89666128, "num_input_tokens_seen": 4546360, "step": 218, "time_per_iteration": 2.5786094665527344 }, { "auxiliary_loss_clip": 0.01358598, "auxiliary_loss_mlp": 0.01054199, "balance_loss_clip": 1.10063612, "balance_loss_mlp": 1.03767681, "epoch": 0.026333193049960922, "flos": 42921402353280.0, "grad_norm": 3.5744107738675703, "language_loss": 0.74149132, "learning_rate": 3.904091188897156e-06, "loss": 0.76561928, "num_input_tokens_seen": 4565495, "step": 219, "time_per_iteration": 2.706536293029785 }, { "auxiliary_loss_clip": 0.01352853, "auxiliary_loss_mlp": 0.01069619, "balance_loss_clip": 1.09920883, "balance_loss_mlp": 1.05455053, "epoch": 0.026453435940600012, "flos": 17963846386560.0, "grad_norm": 2.0829457919395837, "language_loss": 0.82206857, "learning_rate": 3.90739163173548e-06, "loss": 0.84629327, "num_input_tokens_seen": 4583330, "step": 220, "time_per_iteration": 2.52390193939209 }, { "auxiliary_loss_clip": 0.01348959, "auxiliary_loss_mlp": 0.01058568, "balance_loss_clip": 1.09491587, "balance_loss_mlp": 1.04258204, "epoch": 0.026573678831239102, "flos": 18984319776000.0, "grad_norm": 2.2873343918620503, "language_loss": 0.88522542, "learning_rate": 3.910677106527646e-06, "loss": 0.90930068, "num_input_tokens_seen": 4600520, "step": 221, "time_per_iteration": 2.527245044708252 }, { "auxiliary_loss_clip": 0.01348308, "auxiliary_loss_mlp": 0.01043507, "balance_loss_clip": 1.09672129, "balance_loss_mlp": 1.02927351, "epoch": 0.026693921721878195, "flos": 29241448634880.0, "grad_norm": 2.493035504738007, "language_loss": 0.84181643, "learning_rate": 3.913947748426004e-06, "loss": 0.86573458, "num_input_tokens_seen": 4617340, "step": 222, "time_per_iteration": 2.5639421939849854 }, { "auxiliary_loss_clip": 0.01350787, "auxiliary_loss_mlp": 0.01058914, "balance_loss_clip": 1.09930432, "balance_loss_mlp": 1.0441798, "epoch": 0.026814164612517285, "flos": 14128083797760.0, "grad_norm": 2.8062866278377845, "language_loss": 0.76366138, "learning_rate": 3.9172036907606136e-06, "loss": 0.78775847, "num_input_tokens_seen": 4630820, "step": 223, "time_per_iteration": 3.2294790744781494 }, { "auxiliary_loss_clip": 0.01348281, "auxiliary_loss_mlp": 0.01057183, "balance_loss_clip": 1.09565723, "balance_loss_mlp": 1.04123282, "epoch": 0.026934407503156375, "flos": 23511973115520.0, "grad_norm": 1.7010636237267593, "language_loss": 0.95084202, "learning_rate": 3.920445065071855e-06, "loss": 0.97489661, "num_input_tokens_seen": 4651985, "step": 224, "time_per_iteration": 2.599256753921509 }, { "auxiliary_loss_clip": 0.01346965, "auxiliary_loss_mlp": 0.01050364, "balance_loss_clip": 1.09693027, "balance_loss_mlp": 1.03624368, "epoch": 0.027054650393795468, "flos": 28950356816640.0, "grad_norm": 3.41104465251325, "language_loss": 0.79954422, "learning_rate": 3.923672001142322e-06, "loss": 0.82351744, "num_input_tokens_seen": 4672295, "step": 225, "time_per_iteration": 3.3217508792877197 }, { "auxiliary_loss_clip": 0.01345738, "auxiliary_loss_mlp": 0.01065658, "balance_loss_clip": 1.09679198, "balance_loss_mlp": 1.05164504, "epoch": 0.027174893284434558, "flos": 31431568596480.0, "grad_norm": 2.06788590509748, "language_loss": 0.84489751, "learning_rate": 3.926884627027996e-06, "loss": 0.86901146, "num_input_tokens_seen": 4696065, "step": 226, "time_per_iteration": 3.4337024688720703 }, { "auxiliary_loss_clip": 0.01345855, "auxiliary_loss_mlp": 0.01062661, "balance_loss_clip": 1.09613228, "balance_loss_mlp": 1.04742622, "epoch": 0.027295136175073648, "flos": 22054466949120.0, "grad_norm": 1.9168353711792765, "language_loss": 0.77478689, "learning_rate": 3.930083069088744e-06, "loss": 0.79887205, "num_input_tokens_seen": 4716065, "step": 227, "time_per_iteration": 2.621284246444702 }, { "auxiliary_loss_clip": 0.01332799, "auxiliary_loss_mlp": 0.01052352, "balance_loss_clip": 1.14926231, "balance_loss_mlp": 1.04462719, "epoch": 0.02741537906571274, "flos": 60800752972800.0, "grad_norm": 1.0095291513436848, "language_loss": 0.59378207, "learning_rate": 3.933267452018137e-06, "loss": 0.61763352, "num_input_tokens_seen": 4775860, "step": 228, "time_per_iteration": 3.1514360904693604 }, { "auxiliary_loss_clip": 0.01346603, "auxiliary_loss_mlp": 0.01054454, "balance_loss_clip": 1.09743333, "balance_loss_mlp": 1.04082239, "epoch": 0.02753562195635183, "flos": 24606278910720.0, "grad_norm": 2.754758500916086, "language_loss": 0.84264541, "learning_rate": 3.936437898872622e-06, "loss": 0.86665606, "num_input_tokens_seen": 4795835, "step": 229, "time_per_iteration": 2.5734877586364746 }, { "auxiliary_loss_clip": 0.01343899, "auxiliary_loss_mlp": 0.01059172, "balance_loss_clip": 1.09582353, "balance_loss_mlp": 1.04387748, "epoch": 0.02765586484699092, "flos": 34094236907520.0, "grad_norm": 2.8586147508221447, "language_loss": 0.79840773, "learning_rate": 3.9395945311000525e-06, "loss": 0.82243842, "num_input_tokens_seen": 4817460, "step": 230, "time_per_iteration": 2.6784613132476807 }, { "auxiliary_loss_clip": 0.01347301, "auxiliary_loss_mlp": 0.01053951, "balance_loss_clip": 1.0970757, "balance_loss_mlp": 1.03782201, "epoch": 0.027776107737630014, "flos": 14829922615680.0, "grad_norm": 2.0717027991475674, "language_loss": 0.90772343, "learning_rate": 3.942737468567608e-06, "loss": 0.93173593, "num_input_tokens_seen": 4835475, "step": 231, "time_per_iteration": 2.5196847915649414 }, { "auxiliary_loss_clip": 0.01340656, "auxiliary_loss_mlp": 0.01053785, "balance_loss_clip": 1.0944767, "balance_loss_mlp": 1.03961062, "epoch": 0.027896350628269104, "flos": 47920347066240.0, "grad_norm": 2.2758480233354046, "language_loss": 0.86153758, "learning_rate": 3.9458668295891026e-06, "loss": 0.88548195, "num_input_tokens_seen": 4857760, "step": 232, "time_per_iteration": 2.766575574874878 }, { "auxiliary_loss_clip": 0.01336423, "auxiliary_loss_mlp": 0.01051812, "balance_loss_clip": 1.09038424, "balance_loss_mlp": 1.03586173, "epoch": 0.028016593518908194, "flos": 21684550734720.0, "grad_norm": 2.311099590148783, "language_loss": 0.86897814, "learning_rate": 3.948982730951712e-06, "loss": 0.89286053, "num_input_tokens_seen": 4875855, "step": 233, "time_per_iteration": 2.513349771499634 }, { "auxiliary_loss_clip": 0.01342793, "auxiliary_loss_mlp": 0.01062367, "balance_loss_clip": 1.09354448, "balance_loss_mlp": 1.04621458, "epoch": 0.028136836409547287, "flos": 18439483305600.0, "grad_norm": 2.315154096288921, "language_loss": 0.81818986, "learning_rate": 3.9520852879421254e-06, "loss": 0.84224153, "num_input_tokens_seen": 4893200, "step": 234, "time_per_iteration": 2.486950397491455 }, { "auxiliary_loss_clip": 0.01341031, "auxiliary_loss_mlp": 0.01055023, "balance_loss_clip": 1.09608364, "balance_loss_mlp": 1.03999102, "epoch": 0.028257079300186377, "flos": 31576934937600.0, "grad_norm": 2.197228528694378, "language_loss": 0.81636643, "learning_rate": 3.955174614372137e-06, "loss": 0.84032691, "num_input_tokens_seen": 4912965, "step": 235, "time_per_iteration": 2.6110587120056152 }, { "auxiliary_loss_clip": 0.01339849, "auxiliary_loss_mlp": 0.01058273, "balance_loss_clip": 1.09528136, "balance_loss_mlp": 1.04339015, "epoch": 0.028377322190825467, "flos": 23513337832320.0, "grad_norm": 3.6045207715356704, "language_loss": 0.84285545, "learning_rate": 3.9582508226037045e-06, "loss": 0.86683667, "num_input_tokens_seen": 4933105, "step": 236, "time_per_iteration": 2.539132833480835 }, { "auxiliary_loss_clip": 0.01342671, "auxiliary_loss_mlp": 0.01060793, "balance_loss_clip": 1.09393406, "balance_loss_mlp": 1.04388952, "epoch": 0.02849756508146456, "flos": 20479604071680.0, "grad_norm": 2.331757055494177, "language_loss": 0.94015527, "learning_rate": 3.9613140235734636e-06, "loss": 0.96418989, "num_input_tokens_seen": 4950085, "step": 237, "time_per_iteration": 2.526366710662842 }, { "auxiliary_loss_clip": 0.01342218, "auxiliary_loss_mlp": 0.01065478, "balance_loss_clip": 1.09648585, "balance_loss_mlp": 1.05064869, "epoch": 0.02861780797210365, "flos": 14283362292480.0, "grad_norm": 1.8613490746181172, "language_loss": 0.81386042, "learning_rate": 3.96436432681674e-06, "loss": 0.83793736, "num_input_tokens_seen": 4968075, "step": 238, "time_per_iteration": 2.4888927936553955 }, { "auxiliary_loss_clip": 0.01340708, "auxiliary_loss_mlp": 0.01066483, "balance_loss_clip": 1.0949719, "balance_loss_mlp": 1.05084896, "epoch": 0.02873805086274274, "flos": 25808532053760.0, "grad_norm": 2.1996843196600864, "language_loss": 0.89110518, "learning_rate": 3.967401840491044e-06, "loss": 0.91517705, "num_input_tokens_seen": 4987355, "step": 239, "time_per_iteration": 2.601609945297241 }, { "auxiliary_loss_clip": 0.01339571, "auxiliary_loss_mlp": 0.01065391, "balance_loss_clip": 1.09678435, "balance_loss_mlp": 1.04994774, "epoch": 0.028858293753381833, "flos": 17304238984320.0, "grad_norm": 2.440108628282799, "language_loss": 0.87792104, "learning_rate": 3.97042667139909e-06, "loss": 0.90197068, "num_input_tokens_seen": 5004680, "step": 240, "time_per_iteration": 2.5748629570007324 }, { "auxiliary_loss_clip": 0.0134225, "auxiliary_loss_mlp": 0.01061305, "balance_loss_clip": 1.09688139, "balance_loss_mlp": 1.04628491, "epoch": 0.028978536644020923, "flos": 23038347358080.0, "grad_norm": 1.970998699805061, "language_loss": 0.87439859, "learning_rate": 3.973438925011327e-06, "loss": 0.89843416, "num_input_tokens_seen": 5022965, "step": 241, "time_per_iteration": 2.561873435974121 }, { "auxiliary_loss_clip": 0.01333938, "auxiliary_loss_mlp": 0.01049484, "balance_loss_clip": 1.08921599, "balance_loss_mlp": 1.03589988, "epoch": 0.029098779534660012, "flos": 28329712692480.0, "grad_norm": 2.6350301267290575, "language_loss": 0.91208136, "learning_rate": 3.976438705488002e-06, "loss": 0.93591559, "num_input_tokens_seen": 5042625, "step": 242, "time_per_iteration": 2.5613090991973877 }, { "auxiliary_loss_clip": 0.01338429, "auxiliary_loss_mlp": 0.01064035, "balance_loss_clip": 1.09511232, "balance_loss_mlp": 1.04922879, "epoch": 0.029219022425299106, "flos": 13881665520000.0, "grad_norm": 2.7389999747801, "language_loss": 0.93144369, "learning_rate": 3.9794261157007744e-06, "loss": 0.9554683, "num_input_tokens_seen": 5060380, "step": 243, "time_per_iteration": 2.5738015174865723 }, { "auxiliary_loss_clip": 0.01341395, "auxiliary_loss_mlp": 0.01057949, "balance_loss_clip": 1.09806585, "balance_loss_mlp": 1.04179573, "epoch": 0.029339265315938196, "flos": 19422501788160.0, "grad_norm": 2.211819095413116, "language_loss": 0.8448602, "learning_rate": 3.982401257253887e-06, "loss": 0.86885363, "num_input_tokens_seen": 5078720, "step": 244, "time_per_iteration": 2.546577215194702 }, { "auxiliary_loss_clip": 0.01334843, "auxiliary_loss_mlp": 0.01060225, "balance_loss_clip": 1.09381151, "balance_loss_mlp": 1.0452404, "epoch": 0.029459508206577285, "flos": 15669550005120.0, "grad_norm": 2.4162360068181763, "language_loss": 0.90021223, "learning_rate": 3.985364230504893e-06, "loss": 0.92416298, "num_input_tokens_seen": 5096605, "step": 245, "time_per_iteration": 2.565833806991577 }, { "auxiliary_loss_clip": 0.01337186, "auxiliary_loss_mlp": 0.01055832, "balance_loss_clip": 1.09351432, "balance_loss_mlp": 1.04033518, "epoch": 0.02957975109721638, "flos": 28220975245440.0, "grad_norm": 1.9058354586502553, "language_loss": 0.8429026, "learning_rate": 3.988315134584976e-06, "loss": 0.86683273, "num_input_tokens_seen": 5116285, "step": 246, "time_per_iteration": 2.5868866443634033 }, { "auxiliary_loss_clip": 0.01339891, "auxiliary_loss_mlp": 0.01054593, "balance_loss_clip": 1.09684002, "balance_loss_mlp": 1.03995454, "epoch": 0.02969999398785547, "flos": 24315869450880.0, "grad_norm": 2.2153913035327677, "language_loss": 0.80170518, "learning_rate": 3.991254067418851e-06, "loss": 0.82564998, "num_input_tokens_seen": 5136825, "step": 247, "time_per_iteration": 2.5695528984069824 }, { "auxiliary_loss_clip": 0.01334671, "auxiliary_loss_mlp": 0.01057435, "balance_loss_clip": 1.0934633, "balance_loss_mlp": 1.0428319, "epoch": 0.02982023687849456, "flos": 35078584193280.0, "grad_norm": 2.325894301640739, "language_loss": 0.82863331, "learning_rate": 3.994181125744254e-06, "loss": 0.85255432, "num_input_tokens_seen": 5158630, "step": 248, "time_per_iteration": 2.63382625579834 }, { "auxiliary_loss_clip": 0.01337917, "auxiliary_loss_mlp": 0.01060942, "balance_loss_clip": 1.09625649, "balance_loss_mlp": 1.04687572, "epoch": 0.02994047976913365, "flos": 26177155378560.0, "grad_norm": 1.7884191148268913, "language_loss": 0.73850513, "learning_rate": 3.99709640513106e-06, "loss": 0.76249373, "num_input_tokens_seen": 5179510, "step": 249, "time_per_iteration": 3.318661689758301 }, { "auxiliary_loss_clip": 0.01338753, "auxiliary_loss_mlp": 0.0104968, "balance_loss_clip": 1.09465575, "balance_loss_mlp": 1.03401566, "epoch": 0.03006072265977274, "flos": 25625028447360.0, "grad_norm": 1.9110357881473876, "language_loss": 0.85603315, "learning_rate": 4e-06, "loss": 0.87991744, "num_input_tokens_seen": 5199345, "step": 250, "time_per_iteration": 2.5853569507598877 }, { "auxiliary_loss_clip": 0.01339696, "auxiliary_loss_mlp": 0.01068033, "balance_loss_clip": 1.09865189, "balance_loss_mlp": 1.05344224, "epoch": 0.03018096555041183, "flos": 22127078292480.0, "grad_norm": 5.981262042134408, "language_loss": 0.88129616, "learning_rate": 3.999999848300794e-06, "loss": 0.90537345, "num_input_tokens_seen": 5218330, "step": 251, "time_per_iteration": 3.255072832107544 }, { "auxiliary_loss_clip": 0.01330771, "auxiliary_loss_mlp": 0.01052639, "balance_loss_clip": 1.0906322, "balance_loss_mlp": 1.03764296, "epoch": 0.030301208441050925, "flos": 30188197359360.0, "grad_norm": 1.6568302554024308, "language_loss": 0.89170271, "learning_rate": 3.999999393203203e-06, "loss": 0.91553676, "num_input_tokens_seen": 5240740, "step": 252, "time_per_iteration": 4.091420888900757 }, { "auxiliary_loss_clip": 0.01328956, "auxiliary_loss_mlp": 0.01057486, "balance_loss_clip": 1.09000897, "balance_loss_mlp": 1.04259706, "epoch": 0.030421451331690014, "flos": 23621392920960.0, "grad_norm": 2.001010413322784, "language_loss": 0.85005176, "learning_rate": 3.999998634707293e-06, "loss": 0.87391627, "num_input_tokens_seen": 5260290, "step": 253, "time_per_iteration": 2.520881175994873 }, { "auxiliary_loss_clip": 0.01337847, "auxiliary_loss_mlp": 0.01063578, "balance_loss_clip": 1.09796453, "balance_loss_mlp": 1.04772305, "epoch": 0.030541694222329104, "flos": 27928446883200.0, "grad_norm": 2.3288344351860335, "language_loss": 0.96367872, "learning_rate": 3.999997572813182e-06, "loss": 0.98769295, "num_input_tokens_seen": 5278100, "step": 254, "time_per_iteration": 2.561330795288086 }, { "auxiliary_loss_clip": 0.01336374, "auxiliary_loss_mlp": 0.01055275, "balance_loss_clip": 1.09416366, "balance_loss_mlp": 1.04055882, "epoch": 0.030661937112968194, "flos": 18588441006720.0, "grad_norm": 1.963566879368455, "language_loss": 0.87522596, "learning_rate": 3.999996207521028e-06, "loss": 0.89914238, "num_input_tokens_seen": 5296810, "step": 255, "time_per_iteration": 2.544182777404785 }, { "auxiliary_loss_clip": 0.01333335, "auxiliary_loss_mlp": 0.0105334, "balance_loss_clip": 1.09279597, "balance_loss_mlp": 1.0371753, "epoch": 0.030782180003607287, "flos": 12969139478400.0, "grad_norm": 2.586974589679778, "language_loss": 0.82402045, "learning_rate": 3.999994538831039e-06, "loss": 0.84788722, "num_input_tokens_seen": 5313395, "step": 256, "time_per_iteration": 2.546891927719116 }, { "auxiliary_loss_clip": 0.01331392, "auxiliary_loss_mlp": 0.01065701, "balance_loss_clip": 1.09196126, "balance_loss_mlp": 1.04934525, "epoch": 0.030902422894246377, "flos": 23335364920320.0, "grad_norm": 2.5537645168596086, "language_loss": 0.85794568, "learning_rate": 3.99999256674347e-06, "loss": 0.88191664, "num_input_tokens_seen": 5333545, "step": 257, "time_per_iteration": 2.5956575870513916 }, { "auxiliary_loss_clip": 0.01307763, "auxiliary_loss_mlp": 0.01021069, "balance_loss_clip": 1.14026558, "balance_loss_mlp": 1.01377344, "epoch": 0.031022665784885467, "flos": 55094151438720.0, "grad_norm": 1.0142093281151994, "language_loss": 0.53576285, "learning_rate": 3.999990291258618e-06, "loss": 0.55905116, "num_input_tokens_seen": 5392235, "step": 258, "time_per_iteration": 3.090458631515503 }, { "auxiliary_loss_clip": 0.01332934, "auxiliary_loss_mlp": 0.01055239, "balance_loss_clip": 1.09403229, "balance_loss_mlp": 1.04191768, "epoch": 0.03114290867552456, "flos": 19317786664320.0, "grad_norm": 2.224334568279, "language_loss": 0.86729789, "learning_rate": 3.999987712376829e-06, "loss": 0.89117956, "num_input_tokens_seen": 5410555, "step": 259, "time_per_iteration": 2.526338815689087 }, { "auxiliary_loss_clip": 0.0133238, "auxiliary_loss_mlp": 0.01051043, "balance_loss_clip": 1.09578979, "balance_loss_mlp": 1.03623128, "epoch": 0.031263151566163654, "flos": 20959442881920.0, "grad_norm": 1.8755625018424293, "language_loss": 0.81756967, "learning_rate": 3.999984830098494e-06, "loss": 0.84140396, "num_input_tokens_seen": 5430135, "step": 260, "time_per_iteration": 2.5190436840057373 }, { "auxiliary_loss_clip": 0.01332976, "auxiliary_loss_mlp": 0.01068433, "balance_loss_clip": 1.09351707, "balance_loss_mlp": 1.05230451, "epoch": 0.03138339445680274, "flos": 14793006412800.0, "grad_norm": 2.966091894560407, "language_loss": 0.97791523, "learning_rate": 3.999981644424051e-06, "loss": 1.00192928, "num_input_tokens_seen": 5444935, "step": 261, "time_per_iteration": 2.499943494796753 }, { "auxiliary_loss_clip": 0.0133133, "auxiliary_loss_mlp": 0.01054499, "balance_loss_clip": 1.09331894, "balance_loss_mlp": 1.03965735, "epoch": 0.03150363734744183, "flos": 11655599022720.0, "grad_norm": 2.3578316503244006, "language_loss": 0.86097372, "learning_rate": 3.999978155353982e-06, "loss": 0.88483196, "num_input_tokens_seen": 5462080, "step": 262, "time_per_iteration": 2.4861016273498535 }, { "auxiliary_loss_clip": 0.01328989, "auxiliary_loss_mlp": 0.01054803, "balance_loss_clip": 1.09129882, "balance_loss_mlp": 1.04084957, "epoch": 0.03162388023808092, "flos": 33727732485120.0, "grad_norm": 2.3171670857925766, "language_loss": 0.80099583, "learning_rate": 3.9999743628888186e-06, "loss": 0.82483381, "num_input_tokens_seen": 5483870, "step": 263, "time_per_iteration": 2.645575761795044 }, { "auxiliary_loss_clip": 0.01330153, "auxiliary_loss_mlp": 0.01044827, "balance_loss_clip": 1.09274089, "balance_loss_mlp": 1.02996206, "epoch": 0.03174412312872001, "flos": 20810952057600.0, "grad_norm": 2.356165196941417, "language_loss": 0.8960517, "learning_rate": 3.999970267029133e-06, "loss": 0.91980159, "num_input_tokens_seen": 5502830, "step": 264, "time_per_iteration": 2.548417568206787 }, { "auxiliary_loss_clip": 0.01327689, "auxiliary_loss_mlp": 0.01053892, "balance_loss_clip": 1.0919385, "balance_loss_mlp": 1.04015875, "epoch": 0.0318643660193591, "flos": 23727939638400.0, "grad_norm": 2.0181700796910316, "language_loss": 0.80020487, "learning_rate": 3.999965867775548e-06, "loss": 0.82402062, "num_input_tokens_seen": 5523225, "step": 265, "time_per_iteration": 2.5712244510650635 }, { "auxiliary_loss_clip": 0.01327563, "auxiliary_loss_mlp": 0.01057608, "balance_loss_clip": 1.09111202, "balance_loss_mlp": 1.04301703, "epoch": 0.0319846089099982, "flos": 13917863450880.0, "grad_norm": 3.6718332502600215, "language_loss": 0.86877465, "learning_rate": 3.9999611651287315e-06, "loss": 0.8926264, "num_input_tokens_seen": 5541380, "step": 266, "time_per_iteration": 2.5207462310791016 }, { "auxiliary_loss_clip": 0.01333813, "auxiliary_loss_mlp": 0.01056239, "balance_loss_clip": 1.09465802, "balance_loss_mlp": 1.04134941, "epoch": 0.03210485180063729, "flos": 14753253035520.0, "grad_norm": 2.4714967286756897, "language_loss": 0.78752661, "learning_rate": 3.999956159089396e-06, "loss": 0.81142712, "num_input_tokens_seen": 5558830, "step": 267, "time_per_iteration": 2.53702974319458 }, { "auxiliary_loss_clip": 0.01328852, "auxiliary_loss_mlp": 0.01064132, "balance_loss_clip": 1.09336877, "balance_loss_mlp": 1.04919529, "epoch": 0.03222509469127638, "flos": 28913153304960.0, "grad_norm": 2.1904948433913956, "language_loss": 0.79897857, "learning_rate": 3.999950849658302e-06, "loss": 0.8229084, "num_input_tokens_seen": 5577750, "step": 268, "time_per_iteration": 2.551652669906616 }, { "auxiliary_loss_clip": 0.01330574, "auxiliary_loss_mlp": 0.01067854, "balance_loss_clip": 1.09451795, "balance_loss_mlp": 1.05218983, "epoch": 0.03234533758191547, "flos": 16946389739520.0, "grad_norm": 6.056220652155281, "language_loss": 0.84342527, "learning_rate": 3.999945236836254e-06, "loss": 0.86740947, "num_input_tokens_seen": 5596715, "step": 269, "time_per_iteration": 2.5469486713409424 }, { "auxiliary_loss_clip": 0.01334459, "auxiliary_loss_mlp": 0.01077234, "balance_loss_clip": 1.09569645, "balance_loss_mlp": 1.06096208, "epoch": 0.03246558047255456, "flos": 18989096284800.0, "grad_norm": 2.8572679751800947, "language_loss": 0.94941884, "learning_rate": 3.999939320624103e-06, "loss": 0.97353578, "num_input_tokens_seen": 5611865, "step": 270, "time_per_iteration": 2.477614402770996 }, { "auxiliary_loss_clip": 0.01330925, "auxiliary_loss_mlp": 0.01053959, "balance_loss_clip": 1.09552705, "balance_loss_mlp": 1.03952253, "epoch": 0.03258582336319365, "flos": 23728334688000.0, "grad_norm": 2.917799163489746, "language_loss": 0.90128386, "learning_rate": 3.999933101022749e-06, "loss": 0.92513269, "num_input_tokens_seen": 5632270, "step": 271, "time_per_iteration": 2.5571577548980713 }, { "auxiliary_loss_clip": 0.01331097, "auxiliary_loss_mlp": 0.01056021, "balance_loss_clip": 1.0959301, "balance_loss_mlp": 1.04074454, "epoch": 0.032706066253832745, "flos": 27670823562240.0, "grad_norm": 2.119537439640371, "language_loss": 0.86997116, "learning_rate": 3.999926578033132e-06, "loss": 0.89384234, "num_input_tokens_seen": 5652085, "step": 272, "time_per_iteration": 2.542473554611206 }, { "auxiliary_loss_clip": 0.01327461, "auxiliary_loss_mlp": 0.01059856, "balance_loss_clip": 1.08978271, "balance_loss_mlp": 1.04360783, "epoch": 0.032826309144471835, "flos": 45624685968000.0, "grad_norm": 2.0921552073934606, "language_loss": 0.62907565, "learning_rate": 3.999919751656244e-06, "loss": 0.6529488, "num_input_tokens_seen": 5678985, "step": 273, "time_per_iteration": 2.789545774459839 }, { "auxiliary_loss_clip": 0.01328661, "auxiliary_loss_mlp": 0.01066036, "balance_loss_clip": 1.09290957, "balance_loss_mlp": 1.05144453, "epoch": 0.032946552035110925, "flos": 25812374808960.0, "grad_norm": 7.65831127895052, "language_loss": 0.7600857, "learning_rate": 3.9999126218931195e-06, "loss": 0.7840327, "num_input_tokens_seen": 5697020, "step": 274, "time_per_iteration": 2.532543182373047 }, { "auxiliary_loss_clip": 0.0133317, "auxiliary_loss_mlp": 0.01065876, "balance_loss_clip": 1.09775794, "balance_loss_mlp": 1.05059338, "epoch": 0.033066794925750015, "flos": 15121984101120.0, "grad_norm": 2.2202396544236116, "language_loss": 0.89606571, "learning_rate": 3.99990518874484e-06, "loss": 0.9200561, "num_input_tokens_seen": 5713460, "step": 275, "time_per_iteration": 3.283907413482666 }, { "auxiliary_loss_clip": 0.01334622, "auxiliary_loss_mlp": 0.01058733, "balance_loss_clip": 1.09773219, "balance_loss_mlp": 1.04416609, "epoch": 0.033187037816389105, "flos": 22776593973120.0, "grad_norm": 2.1791699189992433, "language_loss": 0.92370325, "learning_rate": 3.999897452212534e-06, "loss": 0.94763684, "num_input_tokens_seen": 5730790, "step": 276, "time_per_iteration": 2.5228428840637207 }, { "auxiliary_loss_clip": 0.01325674, "auxiliary_loss_mlp": 0.01065248, "balance_loss_clip": 1.09222293, "balance_loss_mlp": 1.04955959, "epoch": 0.033307280707028195, "flos": 23331414424320.0, "grad_norm": 2.1790188640933073, "language_loss": 1.00224972, "learning_rate": 3.999889412297374e-06, "loss": 1.02615881, "num_input_tokens_seen": 5750215, "step": 277, "time_per_iteration": 2.558551788330078 }, { "auxiliary_loss_clip": 0.01329355, "auxiliary_loss_mlp": 0.0105818, "balance_loss_clip": 1.09278548, "balance_loss_mlp": 1.04369628, "epoch": 0.03342752359766729, "flos": 28840290566400.0, "grad_norm": 1.9550817178253963, "language_loss": 0.78991795, "learning_rate": 3.999881069000581e-06, "loss": 0.8137933, "num_input_tokens_seen": 5769945, "step": 278, "time_per_iteration": 2.5984652042388916 }, { "auxiliary_loss_clip": 0.01324833, "auxiliary_loss_mlp": 0.0105935, "balance_loss_clip": 1.08919489, "balance_loss_mlp": 1.04443717, "epoch": 0.03354776648830638, "flos": 19384544090880.0, "grad_norm": 3.770273553066629, "language_loss": 0.86596984, "learning_rate": 3.99987242232342e-06, "loss": 0.88981164, "num_input_tokens_seen": 5784950, "step": 279, "time_per_iteration": 4.056426048278809 }, { "auxiliary_loss_clip": 0.01328504, "auxiliary_loss_mlp": 0.01059067, "balance_loss_clip": 1.09599733, "balance_loss_mlp": 1.04389191, "epoch": 0.03366800937894547, "flos": 17858628472320.0, "grad_norm": 1.7795316541151167, "language_loss": 0.79775548, "learning_rate": 3.9998634722672026e-06, "loss": 0.82163119, "num_input_tokens_seen": 5805005, "step": 280, "time_per_iteration": 2.554086446762085 }, { "auxiliary_loss_clip": 0.0133201, "auxiliary_loss_mlp": 0.01052625, "balance_loss_clip": 1.09756231, "balance_loss_mlp": 1.03718758, "epoch": 0.03378825226958456, "flos": 35951033635200.0, "grad_norm": 2.2206088407616855, "language_loss": 0.78592384, "learning_rate": 3.999854218833286e-06, "loss": 0.80977017, "num_input_tokens_seen": 5825825, "step": 281, "time_per_iteration": 2.6415319442749023 }, { "auxiliary_loss_clip": 0.01326787, "auxiliary_loss_mlp": 0.01061746, "balance_loss_clip": 1.09236896, "balance_loss_mlp": 1.04797077, "epoch": 0.03390849516022365, "flos": 25702488126720.0, "grad_norm": 2.373221073650071, "language_loss": 0.82169604, "learning_rate": 3.999844662023075e-06, "loss": 0.84558141, "num_input_tokens_seen": 5845700, "step": 282, "time_per_iteration": 2.566725492477417 }, { "auxiliary_loss_clip": 0.01322713, "auxiliary_loss_mlp": 0.01050863, "balance_loss_clip": 1.09205127, "balance_loss_mlp": 1.0379827, "epoch": 0.03402873805086274, "flos": 21284505987840.0, "grad_norm": 1.7683111561381406, "language_loss": 0.91957229, "learning_rate": 3.999834801838018e-06, "loss": 0.94330812, "num_input_tokens_seen": 5864680, "step": 283, "time_per_iteration": 2.5180513858795166 }, { "auxiliary_loss_clip": 0.01324341, "auxiliary_loss_mlp": 0.01049379, "balance_loss_clip": 1.0922296, "balance_loss_mlp": 1.03497267, "epoch": 0.03414898094150183, "flos": 22710913954560.0, "grad_norm": 1.852058272429499, "language_loss": 0.73851526, "learning_rate": 3.9998246382796115e-06, "loss": 0.76225239, "num_input_tokens_seen": 5884260, "step": 284, "time_per_iteration": 2.552903890609741 }, { "auxiliary_loss_clip": 0.01324018, "auxiliary_loss_mlp": 0.01054897, "balance_loss_clip": 1.09035015, "balance_loss_mlp": 1.03925633, "epoch": 0.03426922383214093, "flos": 18879927874560.0, "grad_norm": 2.0351008456041066, "language_loss": 0.90830219, "learning_rate": 3.999814171349399e-06, "loss": 0.93209136, "num_input_tokens_seen": 5902120, "step": 285, "time_per_iteration": 2.4935922622680664 }, { "auxiliary_loss_clip": 0.01321396, "auxiliary_loss_mlp": 0.01074059, "balance_loss_clip": 1.09098375, "balance_loss_mlp": 1.06048715, "epoch": 0.03438946672278002, "flos": 34752012716160.0, "grad_norm": 2.012974160584163, "language_loss": 0.73750973, "learning_rate": 3.9998034010489655e-06, "loss": 0.7614643, "num_input_tokens_seen": 5925810, "step": 286, "time_per_iteration": 2.6690914630889893 }, { "auxiliary_loss_clip": 0.01323152, "auxiliary_loss_mlp": 0.01039788, "balance_loss_clip": 1.0927459, "balance_loss_mlp": 1.02609694, "epoch": 0.03450970961341911, "flos": 22164102236160.0, "grad_norm": 2.0071599458032527, "language_loss": 0.75688303, "learning_rate": 3.999792327379946e-06, "loss": 0.78051251, "num_input_tokens_seen": 5945185, "step": 287, "time_per_iteration": 2.553694486618042 }, { "auxiliary_loss_clip": 0.01322951, "auxiliary_loss_mlp": 0.01061798, "balance_loss_clip": 1.0925988, "balance_loss_mlp": 1.04723108, "epoch": 0.034629952504058197, "flos": 21725740656000.0, "grad_norm": 2.113283984984685, "language_loss": 0.96186984, "learning_rate": 3.999780950344021e-06, "loss": 0.9857173, "num_input_tokens_seen": 5963375, "step": 288, "time_per_iteration": 2.5291478633880615 }, { "auxiliary_loss_clip": 0.01324364, "auxiliary_loss_mlp": 0.0105475, "balance_loss_clip": 1.09139669, "balance_loss_mlp": 1.03870451, "epoch": 0.034750195394697286, "flos": 20047994248320.0, "grad_norm": 1.9985061410834943, "language_loss": 0.82529783, "learning_rate": 3.999769269942916e-06, "loss": 0.84908897, "num_input_tokens_seen": 5983415, "step": 289, "time_per_iteration": 2.532067060470581 }, { "auxiliary_loss_clip": 0.0132125, "auxiliary_loss_mlp": 0.01062755, "balance_loss_clip": 1.09004462, "balance_loss_mlp": 1.04876018, "epoch": 0.034870438285336376, "flos": 27965865876480.0, "grad_norm": 2.1282564810361606, "language_loss": 0.80963022, "learning_rate": 3.999757286178402e-06, "loss": 0.83347034, "num_input_tokens_seen": 6005850, "step": 290, "time_per_iteration": 2.603459119796753 }, { "auxiliary_loss_clip": 0.01324975, "auxiliary_loss_mlp": 0.01063681, "balance_loss_clip": 1.09452963, "balance_loss_mlp": 1.04955435, "epoch": 0.03499068117597547, "flos": 22017514832640.0, "grad_norm": 2.057064719793104, "language_loss": 0.90830946, "learning_rate": 3.999744999052299e-06, "loss": 0.93219602, "num_input_tokens_seen": 6027240, "step": 291, "time_per_iteration": 2.5722527503967285 }, { "auxiliary_loss_clip": 0.01275081, "auxiliary_loss_mlp": 0.01046138, "balance_loss_clip": 1.11859703, "balance_loss_mlp": 1.03955734, "epoch": 0.03511092406661456, "flos": 57242147725440.0, "grad_norm": 0.9682003665864284, "language_loss": 0.61191434, "learning_rate": 3.9997324085664675e-06, "loss": 0.63512653, "num_input_tokens_seen": 6087470, "step": 292, "time_per_iteration": 3.109800100326538 }, { "auxiliary_loss_clip": 0.01318551, "auxiliary_loss_mlp": 0.0106171, "balance_loss_clip": 1.08691382, "balance_loss_mlp": 1.04717851, "epoch": 0.03523116695725365, "flos": 22928065626240.0, "grad_norm": 2.405873206927614, "language_loss": 0.92186075, "learning_rate": 3.999719514722821e-06, "loss": 0.94566333, "num_input_tokens_seen": 6107600, "step": 293, "time_per_iteration": 2.5438714027404785 }, { "auxiliary_loss_clip": 0.01318037, "auxiliary_loss_mlp": 0.01052422, "balance_loss_clip": 1.08970761, "balance_loss_mlp": 1.03914189, "epoch": 0.03535140984789274, "flos": 36903241226880.0, "grad_norm": 13.61778190660226, "language_loss": 0.74897373, "learning_rate": 3.999706317523314e-06, "loss": 0.77267826, "num_input_tokens_seen": 6126160, "step": 294, "time_per_iteration": 2.6228296756744385 }, { "auxiliary_loss_clip": 0.01320034, "auxiliary_loss_mlp": 0.01047499, "balance_loss_clip": 1.09124923, "balance_loss_mlp": 1.03434408, "epoch": 0.03547165273853183, "flos": 20449152316800.0, "grad_norm": 2.400715673874234, "language_loss": 0.8594321, "learning_rate": 3.999692816969948e-06, "loss": 0.88310748, "num_input_tokens_seen": 6145695, "step": 295, "time_per_iteration": 2.5144224166870117 }, { "auxiliary_loss_clip": 0.01266356, "auxiliary_loss_mlp": 0.01008995, "balance_loss_clip": 1.11271596, "balance_loss_mlp": 1.00222421, "epoch": 0.03559189562917092, "flos": 69850564871040.0, "grad_norm": 1.001732724736107, "language_loss": 0.6940074, "learning_rate": 3.999679013064772e-06, "loss": 0.71676087, "num_input_tokens_seen": 6212440, "step": 296, "time_per_iteration": 3.134211778640747 }, { "auxiliary_loss_clip": 0.01317803, "auxiliary_loss_mlp": 0.01067996, "balance_loss_clip": 1.08846796, "balance_loss_mlp": 1.05359507, "epoch": 0.03571213851981002, "flos": 21651944163840.0, "grad_norm": 2.714939384158462, "language_loss": 0.85783637, "learning_rate": 3.99966490580988e-06, "loss": 0.88169432, "num_input_tokens_seen": 6229800, "step": 297, "time_per_iteration": 2.532797336578369 }, { "auxiliary_loss_clip": 0.01322419, "auxiliary_loss_mlp": 0.01056891, "balance_loss_clip": 1.09243202, "balance_loss_mlp": 1.04200768, "epoch": 0.03583238141044911, "flos": 43945610757120.0, "grad_norm": 2.0070251973319766, "language_loss": 0.6565522, "learning_rate": 3.999650495207411e-06, "loss": 0.6803453, "num_input_tokens_seen": 6255825, "step": 298, "time_per_iteration": 2.711317300796509 }, { "auxiliary_loss_clip": 0.01319406, "auxiliary_loss_mlp": 0.01058953, "balance_loss_clip": 1.09167385, "balance_loss_mlp": 1.04460025, "epoch": 0.0359526243010882, "flos": 18910810592640.0, "grad_norm": 2.608592937394606, "language_loss": 0.90209717, "learning_rate": 3.999635781259553e-06, "loss": 0.92588079, "num_input_tokens_seen": 6271090, "step": 299, "time_per_iteration": 2.5290775299072266 }, { "auxiliary_loss_clip": 0.01256287, "auxiliary_loss_mlp": 0.01015083, "balance_loss_clip": 1.10583234, "balance_loss_mlp": 1.00845551, "epoch": 0.03607286719172729, "flos": 61668892782720.0, "grad_norm": 0.9175789158310258, "language_loss": 0.52284563, "learning_rate": 3.999620763968535e-06, "loss": 0.54555935, "num_input_tokens_seen": 6329965, "step": 300, "time_per_iteration": 2.944399833679199 }, { "auxiliary_loss_clip": 0.0132123, "auxiliary_loss_mlp": 0.01065524, "balance_loss_clip": 1.09450316, "balance_loss_mlp": 1.05258417, "epoch": 0.03619311008236638, "flos": 27819062991360.0, "grad_norm": 1.8334650054517432, "language_loss": 0.86413467, "learning_rate": 3.999605443336638e-06, "loss": 0.88800228, "num_input_tokens_seen": 6352095, "step": 301, "time_per_iteration": 2.5617876052856445 }, { "auxiliary_loss_clip": 0.01323598, "auxiliary_loss_mlp": 0.01045721, "balance_loss_clip": 1.09255791, "balance_loss_mlp": 1.03193998, "epoch": 0.03631335297300547, "flos": 13621133197440.0, "grad_norm": 2.827447195240601, "language_loss": 0.8958323, "learning_rate": 3.999589819366185e-06, "loss": 0.91952544, "num_input_tokens_seen": 6365885, "step": 302, "time_per_iteration": 2.4725828170776367 }, { "auxiliary_loss_clip": 0.01317267, "auxiliary_loss_mlp": 0.01061278, "balance_loss_clip": 1.087165, "balance_loss_mlp": 1.04630578, "epoch": 0.036433595863644565, "flos": 27631788456960.0, "grad_norm": 2.198552807054148, "language_loss": 0.84808797, "learning_rate": 3.999573892059547e-06, "loss": 0.87187338, "num_input_tokens_seen": 6385015, "step": 303, "time_per_iteration": 3.301636219024658 }, { "auxiliary_loss_clip": 0.0132151, "auxiliary_loss_mlp": 0.01055352, "balance_loss_clip": 1.09059536, "balance_loss_mlp": 1.03958058, "epoch": 0.036553838754283655, "flos": 24572020314240.0, "grad_norm": 1.9487025598848489, "language_loss": 0.81258619, "learning_rate": 3.999557661419138e-06, "loss": 0.83635473, "num_input_tokens_seen": 6405165, "step": 304, "time_per_iteration": 2.574895143508911 }, { "auxiliary_loss_clip": 0.01324041, "auxiliary_loss_mlp": 0.01047224, "balance_loss_clip": 1.09500861, "balance_loss_mlp": 1.03371191, "epoch": 0.036674081644922744, "flos": 23404313076480.0, "grad_norm": 1.8242542276275375, "language_loss": 0.81441307, "learning_rate": 3.9995411274474225e-06, "loss": 0.83812571, "num_input_tokens_seen": 6424445, "step": 305, "time_per_iteration": 4.004798650741577 }, { "auxiliary_loss_clip": 0.0132233, "auxiliary_loss_mlp": 0.01052237, "balance_loss_clip": 1.09319925, "balance_loss_mlp": 1.03687048, "epoch": 0.036794324535561834, "flos": 27489690253440.0, "grad_norm": 2.0142004482064397, "language_loss": 0.81508338, "learning_rate": 3.999524290146908e-06, "loss": 0.83882904, "num_input_tokens_seen": 6444650, "step": 306, "time_per_iteration": 3.3743178844451904 }, { "auxiliary_loss_clip": 0.0132331, "auxiliary_loss_mlp": 0.01049878, "balance_loss_clip": 1.09614086, "balance_loss_mlp": 1.03555536, "epoch": 0.036914567426200924, "flos": 19463476227840.0, "grad_norm": 2.1339219922187103, "language_loss": 0.92245024, "learning_rate": 3.9995071495201485e-06, "loss": 0.94618213, "num_input_tokens_seen": 6461755, "step": 307, "time_per_iteration": 2.5409111976623535 }, { "auxiliary_loss_clip": 0.01319063, "auxiliary_loss_mlp": 0.01057471, "balance_loss_clip": 1.09215701, "balance_loss_mlp": 1.04300463, "epoch": 0.037034810316840014, "flos": 22309324922880.0, "grad_norm": 2.6638064577893363, "language_loss": 0.97833025, "learning_rate": 3.999489705569744e-06, "loss": 1.00209558, "num_input_tokens_seen": 6479455, "step": 308, "time_per_iteration": 2.5393967628479004 }, { "auxiliary_loss_clip": 0.01315008, "auxiliary_loss_mlp": 0.01052987, "balance_loss_clip": 1.08826971, "balance_loss_mlp": 1.03765678, "epoch": 0.03715505320747911, "flos": 18588333265920.0, "grad_norm": 2.302844709494629, "language_loss": 0.8655858, "learning_rate": 3.999471958298341e-06, "loss": 0.88926584, "num_input_tokens_seen": 6498365, "step": 309, "time_per_iteration": 2.5324020385742188 }, { "auxiliary_loss_clip": 0.01324175, "auxiliary_loss_mlp": 0.0105484, "balance_loss_clip": 1.09257567, "balance_loss_mlp": 1.03959358, "epoch": 0.0372752960981182, "flos": 35955343267200.0, "grad_norm": 1.7614688527919695, "language_loss": 0.76180172, "learning_rate": 3.999453907708631e-06, "loss": 0.78559184, "num_input_tokens_seen": 6520770, "step": 310, "time_per_iteration": 2.6303646564483643 }, { "auxiliary_loss_clip": 0.01317565, "auxiliary_loss_mlp": 0.01056307, "balance_loss_clip": 1.09066117, "balance_loss_mlp": 1.04166198, "epoch": 0.03739553898875729, "flos": 20814040627200.0, "grad_norm": 2.113059096009357, "language_loss": 0.81386268, "learning_rate": 3.999435553803353e-06, "loss": 0.83760142, "num_input_tokens_seen": 6540170, "step": 311, "time_per_iteration": 2.5118112564086914 }, { "auxiliary_loss_clip": 0.01316403, "auxiliary_loss_mlp": 0.01058637, "balance_loss_clip": 1.08986211, "balance_loss_mlp": 1.04271078, "epoch": 0.03751578187939638, "flos": 20264140339200.0, "grad_norm": 2.7524671069512125, "language_loss": 0.83355778, "learning_rate": 3.999416896585292e-06, "loss": 0.85730821, "num_input_tokens_seen": 6557200, "step": 312, "time_per_iteration": 2.5100231170654297 }, { "auxiliary_loss_clip": 0.01317708, "auxiliary_loss_mlp": 0.01054512, "balance_loss_clip": 1.09037626, "balance_loss_mlp": 1.04100585, "epoch": 0.03763602477003547, "flos": 20668063754880.0, "grad_norm": 4.403190873109443, "language_loss": 0.85844338, "learning_rate": 3.9993979360572775e-06, "loss": 0.88216555, "num_input_tokens_seen": 6577340, "step": 313, "time_per_iteration": 2.538710355758667 }, { "auxiliary_loss_clip": 0.01327224, "auxiliary_loss_mlp": 0.01054335, "balance_loss_clip": 1.09789264, "balance_loss_mlp": 1.03933239, "epoch": 0.03775626766067456, "flos": 16691352197760.0, "grad_norm": 2.8007867887371876, "language_loss": 0.82909524, "learning_rate": 3.999378672222185e-06, "loss": 0.85291082, "num_input_tokens_seen": 6595125, "step": 314, "time_per_iteration": 2.4923055171966553 }, { "auxiliary_loss_clip": 0.01316873, "auxiliary_loss_mlp": 0.01059032, "balance_loss_clip": 1.09186316, "balance_loss_mlp": 1.04442859, "epoch": 0.03787651055131366, "flos": 21141797253120.0, "grad_norm": 2.0383167509445723, "language_loss": 0.82744318, "learning_rate": 3.9993591050829385e-06, "loss": 0.85120225, "num_input_tokens_seen": 6612990, "step": 315, "time_per_iteration": 2.517538547515869 }, { "auxiliary_loss_clip": 0.01318752, "auxiliary_loss_mlp": 0.01063091, "balance_loss_clip": 1.09249866, "balance_loss_mlp": 1.04840457, "epoch": 0.037996753441952746, "flos": 22018089450240.0, "grad_norm": 2.1169643942368284, "language_loss": 0.79442149, "learning_rate": 3.999339234642506e-06, "loss": 0.81823993, "num_input_tokens_seen": 6632740, "step": 316, "time_per_iteration": 2.515993595123291 }, { "auxiliary_loss_clip": 0.01318907, "auxiliary_loss_mlp": 0.01050864, "balance_loss_clip": 1.09165907, "balance_loss_mlp": 1.0359869, "epoch": 0.038116996332591836, "flos": 27709391790720.0, "grad_norm": 1.9164393816485623, "language_loss": 0.83559942, "learning_rate": 3.9993190609038994e-06, "loss": 0.8592971, "num_input_tokens_seen": 6651505, "step": 317, "time_per_iteration": 2.5499019622802734 }, { "auxiliary_loss_clip": 0.0131668, "auxiliary_loss_mlp": 0.01058768, "balance_loss_clip": 1.09112442, "balance_loss_mlp": 1.04456484, "epoch": 0.038237239223230926, "flos": 21178067011200.0, "grad_norm": 1.838685332448648, "language_loss": 0.83224481, "learning_rate": 3.999298583870182e-06, "loss": 0.85599929, "num_input_tokens_seen": 6671090, "step": 318, "time_per_iteration": 2.5035042762756348 }, { "auxiliary_loss_clip": 0.01319901, "auxiliary_loss_mlp": 0.01055421, "balance_loss_clip": 1.09371233, "balance_loss_mlp": 1.04067469, "epoch": 0.038357482113870016, "flos": 25556618995200.0, "grad_norm": 1.8943434422359402, "language_loss": 0.77685165, "learning_rate": 3.999277803544458e-06, "loss": 0.80060482, "num_input_tokens_seen": 6691245, "step": 319, "time_per_iteration": 2.5568432807922363 }, { "auxiliary_loss_clip": 0.01233894, "auxiliary_loss_mlp": 0.01031339, "balance_loss_clip": 1.08934855, "balance_loss_mlp": 1.024997, "epoch": 0.038477725004509106, "flos": 59227578034560.0, "grad_norm": 0.9513375427893127, "language_loss": 0.62348098, "learning_rate": 3.999256719929882e-06, "loss": 0.64613324, "num_input_tokens_seen": 6752520, "step": 320, "time_per_iteration": 3.072059154510498 }, { "auxiliary_loss_clip": 0.01231506, "auxiliary_loss_mlp": 0.01018007, "balance_loss_clip": 1.08758235, "balance_loss_mlp": 1.01176012, "epoch": 0.0385979678951482, "flos": 67317676398720.0, "grad_norm": 1.2112025694047706, "language_loss": 0.67128277, "learning_rate": 3.999235333029651e-06, "loss": 0.69377792, "num_input_tokens_seen": 6806460, "step": 321, "time_per_iteration": 3.005730390548706 }, { "auxiliary_loss_clip": 0.01318229, "auxiliary_loss_mlp": 0.01057503, "balance_loss_clip": 1.09297633, "balance_loss_mlp": 1.04427636, "epoch": 0.03871821078578729, "flos": 22746752749440.0, "grad_norm": 1.8296146687539867, "language_loss": 0.82062113, "learning_rate": 3.999213642847009e-06, "loss": 0.84437847, "num_input_tokens_seen": 6827045, "step": 322, "time_per_iteration": 2.5541253089904785 }, { "auxiliary_loss_clip": 0.01319212, "auxiliary_loss_mlp": 0.01051964, "balance_loss_clip": 1.09297347, "balance_loss_mlp": 1.03786135, "epoch": 0.03883845367642638, "flos": 26280613526400.0, "grad_norm": 1.6981699957530276, "language_loss": 0.90994018, "learning_rate": 3.999191649385247e-06, "loss": 0.93365192, "num_input_tokens_seen": 6848220, "step": 323, "time_per_iteration": 2.5618481636047363 }, { "auxiliary_loss_clip": 0.01225398, "auxiliary_loss_mlp": 0.01014446, "balance_loss_clip": 1.08278096, "balance_loss_mlp": 1.00834227, "epoch": 0.03895869656706547, "flos": 56962835568000.0, "grad_norm": 0.9110791348930323, "language_loss": 0.59806377, "learning_rate": 3.999169352647702e-06, "loss": 0.62046224, "num_input_tokens_seen": 6909400, "step": 324, "time_per_iteration": 3.046316385269165 }, { "auxiliary_loss_clip": 0.01319852, "auxiliary_loss_mlp": 0.01058214, "balance_loss_clip": 1.09322512, "balance_loss_mlp": 1.04365218, "epoch": 0.03907893945770456, "flos": 24863363527680.0, "grad_norm": 1.7755059170984826, "language_loss": 0.82945853, "learning_rate": 3.999146752637755e-06, "loss": 0.85323918, "num_input_tokens_seen": 6930445, "step": 325, "time_per_iteration": 2.539952278137207 }, { "auxiliary_loss_clip": 0.013157, "auxiliary_loss_mlp": 0.01051247, "balance_loss_clip": 1.08970344, "balance_loss_mlp": 1.03590441, "epoch": 0.03919918234834365, "flos": 18368595815040.0, "grad_norm": 2.462607271883047, "language_loss": 0.89657652, "learning_rate": 3.999123849358836e-06, "loss": 0.92024595, "num_input_tokens_seen": 6948110, "step": 326, "time_per_iteration": 2.504488229751587 }, { "auxiliary_loss_clip": 0.01312834, "auxiliary_loss_mlp": 0.0105805, "balance_loss_clip": 1.08887684, "balance_loss_mlp": 1.04446626, "epoch": 0.03931942523898275, "flos": 25225414663680.0, "grad_norm": 1.9864590551152903, "language_loss": 0.74770623, "learning_rate": 3.999100642814418e-06, "loss": 0.77141511, "num_input_tokens_seen": 6968550, "step": 327, "time_per_iteration": 2.533271074295044 }, { "auxiliary_loss_clip": 0.01314575, "auxiliary_loss_mlp": 0.01065208, "balance_loss_clip": 1.09086835, "balance_loss_mlp": 1.05058134, "epoch": 0.03943966812962184, "flos": 23257905240960.0, "grad_norm": 2.4962273177387804, "language_loss": 0.88747549, "learning_rate": 3.999077133008022e-06, "loss": 0.91127336, "num_input_tokens_seen": 6987135, "step": 328, "time_per_iteration": 2.533186674118042 }, { "auxiliary_loss_clip": 0.01318609, "auxiliary_loss_mlp": 0.01061055, "balance_loss_clip": 1.09328806, "balance_loss_mlp": 1.04616582, "epoch": 0.03955991102026093, "flos": 29168837291520.0, "grad_norm": 1.7605090985753582, "language_loss": 0.90584797, "learning_rate": 3.9990533199432145e-06, "loss": 0.92964464, "num_input_tokens_seen": 7008630, "step": 329, "time_per_iteration": 3.349200487136841 }, { "auxiliary_loss_clip": 0.01316768, "auxiliary_loss_mlp": 0.01048766, "balance_loss_clip": 1.09302616, "balance_loss_mlp": 1.03388894, "epoch": 0.03968015391090002, "flos": 17602441695360.0, "grad_norm": 2.28323905735946, "language_loss": 0.75842416, "learning_rate": 3.999029203623608e-06, "loss": 0.78207958, "num_input_tokens_seen": 7026350, "step": 330, "time_per_iteration": 2.5184061527252197 }, { "auxiliary_loss_clip": 0.01316288, "auxiliary_loss_mlp": 0.0106005, "balance_loss_clip": 1.093889, "balance_loss_mlp": 1.04597139, "epoch": 0.03980039680153911, "flos": 21799285752960.0, "grad_norm": 1.9373879238885332, "language_loss": 0.86733139, "learning_rate": 3.99900478405286e-06, "loss": 0.8910948, "num_input_tokens_seen": 7045660, "step": 331, "time_per_iteration": 3.2127890586853027 }, { "auxiliary_loss_clip": 0.01318631, "auxiliary_loss_mlp": 0.01057384, "balance_loss_clip": 1.09571314, "balance_loss_mlp": 1.04310274, "epoch": 0.0399206396921782, "flos": 15195134148480.0, "grad_norm": 2.2008021697812667, "language_loss": 0.82537913, "learning_rate": 3.998980061234676e-06, "loss": 0.84913921, "num_input_tokens_seen": 7063575, "step": 332, "time_per_iteration": 4.526222467422485 }, { "auxiliary_loss_clip": 0.0131322, "auxiliary_loss_mlp": 0.01053636, "balance_loss_clip": 1.09026265, "balance_loss_mlp": 1.0388546, "epoch": 0.040040882582817294, "flos": 14422910630400.0, "grad_norm": 2.5565098907307484, "language_loss": 0.7565695, "learning_rate": 3.9989550351728055e-06, "loss": 0.78023803, "num_input_tokens_seen": 7080505, "step": 333, "time_per_iteration": 2.531641721725464 }, { "auxiliary_loss_clip": 0.01310734, "auxiliary_loss_mlp": 0.01049518, "balance_loss_clip": 1.09068918, "balance_loss_mlp": 1.0348376, "epoch": 0.040161125473456384, "flos": 19280906375040.0, "grad_norm": 2.596717113650031, "language_loss": 0.8464781, "learning_rate": 3.998929705871046e-06, "loss": 0.87008059, "num_input_tokens_seen": 7097860, "step": 334, "time_per_iteration": 2.545999050140381 }, { "auxiliary_loss_clip": 0.01314636, "auxiliary_loss_mlp": 0.01053243, "balance_loss_clip": 1.09407616, "balance_loss_mlp": 1.03890789, "epoch": 0.040281368364095474, "flos": 17821101738240.0, "grad_norm": 2.356084721211621, "language_loss": 0.89058423, "learning_rate": 3.99890407333324e-06, "loss": 0.91426301, "num_input_tokens_seen": 7116390, "step": 335, "time_per_iteration": 2.514615297317505 }, { "auxiliary_loss_clip": 0.01313341, "auxiliary_loss_mlp": 0.01065751, "balance_loss_clip": 1.09012496, "balance_loss_mlp": 1.05113578, "epoch": 0.040401611254734564, "flos": 19573757959680.0, "grad_norm": 1.6977392657812431, "language_loss": 0.86971194, "learning_rate": 3.998878137563275e-06, "loss": 0.89350283, "num_input_tokens_seen": 7135940, "step": 336, "time_per_iteration": 2.5181312561035156 }, { "auxiliary_loss_clip": 0.01313754, "auxiliary_loss_mlp": 0.0105063, "balance_loss_clip": 1.09159148, "balance_loss_mlp": 1.03569269, "epoch": 0.040521854145373654, "flos": 22054466949120.0, "grad_norm": 2.2703484025276937, "language_loss": 0.85037673, "learning_rate": 3.998851898565085e-06, "loss": 0.87402058, "num_input_tokens_seen": 7155745, "step": 337, "time_per_iteration": 2.528578281402588 }, { "auxiliary_loss_clip": 0.01311248, "auxiliary_loss_mlp": 0.01054638, "balance_loss_clip": 1.08934045, "balance_loss_mlp": 1.0408392, "epoch": 0.04064209703601274, "flos": 22674644196480.0, "grad_norm": 1.9780194479753836, "language_loss": 0.83019316, "learning_rate": 3.998825356342653e-06, "loss": 0.85385203, "num_input_tokens_seen": 7175920, "step": 338, "time_per_iteration": 2.520033359527588 }, { "auxiliary_loss_clip": 0.01310582, "auxiliary_loss_mlp": 0.01055805, "balance_loss_clip": 1.08977628, "balance_loss_mlp": 1.04192293, "epoch": 0.04076233992665183, "flos": 38582172783360.0, "grad_norm": 2.1946952760656755, "language_loss": 0.7307989, "learning_rate": 3.998798510900003e-06, "loss": 0.75446284, "num_input_tokens_seen": 7198720, "step": 339, "time_per_iteration": 2.6788558959960938 }, { "auxiliary_loss_clip": 0.01306239, "auxiliary_loss_mlp": 0.01062111, "balance_loss_clip": 1.08742023, "balance_loss_mlp": 1.04769886, "epoch": 0.04088258281729093, "flos": 25885309374720.0, "grad_norm": 2.2647429576224005, "language_loss": 0.83690643, "learning_rate": 3.998771362241207e-06, "loss": 0.86058992, "num_input_tokens_seen": 7219125, "step": 340, "time_per_iteration": 2.5736894607543945 }, { "auxiliary_loss_clip": 0.01305508, "auxiliary_loss_mlp": 0.01062149, "balance_loss_clip": 1.08711779, "balance_loss_mlp": 1.04747462, "epoch": 0.04100282570793002, "flos": 19789832223360.0, "grad_norm": 1.8026651717589495, "language_loss": 0.8805961, "learning_rate": 3.998743910370385e-06, "loss": 0.90427274, "num_input_tokens_seen": 7237985, "step": 341, "time_per_iteration": 2.5231807231903076 }, { "auxiliary_loss_clip": 0.01314185, "auxiliary_loss_mlp": 0.01071864, "balance_loss_clip": 1.09502923, "balance_loss_mlp": 1.05745149, "epoch": 0.04112306859856911, "flos": 22565152563840.0, "grad_norm": 2.005214437669453, "language_loss": 0.73053145, "learning_rate": 3.998716155291702e-06, "loss": 0.75439191, "num_input_tokens_seen": 7255825, "step": 342, "time_per_iteration": 2.5463719367980957 }, { "auxiliary_loss_clip": 0.01314291, "auxiliary_loss_mlp": 0.01056984, "balance_loss_clip": 1.09403527, "balance_loss_mlp": 1.04298902, "epoch": 0.0412433114892082, "flos": 25040654081280.0, "grad_norm": 2.567936122791699, "language_loss": 0.90308261, "learning_rate": 3.998688097009366e-06, "loss": 0.92679548, "num_input_tokens_seen": 7276590, "step": 343, "time_per_iteration": 2.5754597187042236 }, { "auxiliary_loss_clip": 0.01307098, "auxiliary_loss_mlp": 0.01062659, "balance_loss_clip": 1.08946478, "balance_loss_mlp": 1.04778171, "epoch": 0.04136355437984729, "flos": 25191371548800.0, "grad_norm": 2.075241516392257, "language_loss": 0.80355072, "learning_rate": 3.998659735527636e-06, "loss": 0.82724833, "num_input_tokens_seen": 7295680, "step": 344, "time_per_iteration": 2.552063465118408 }, { "auxiliary_loss_clip": 0.013087, "auxiliary_loss_mlp": 0.01055991, "balance_loss_clip": 1.08985329, "balance_loss_mlp": 1.04239511, "epoch": 0.04148379727048638, "flos": 22966777509120.0, "grad_norm": 1.6717548354628131, "language_loss": 0.77905518, "learning_rate": 3.998631070850813e-06, "loss": 0.80270207, "num_input_tokens_seen": 7316300, "step": 345, "time_per_iteration": 2.5350868701934814 }, { "auxiliary_loss_clip": 0.01308274, "auxiliary_loss_mlp": 0.01058827, "balance_loss_clip": 1.0912683, "balance_loss_mlp": 1.04557133, "epoch": 0.041604040161125476, "flos": 14063481187200.0, "grad_norm": 2.6215785120684765, "language_loss": 0.8351419, "learning_rate": 3.9986021029832455e-06, "loss": 0.85881293, "num_input_tokens_seen": 7333615, "step": 346, "time_per_iteration": 2.5036513805389404 }, { "auxiliary_loss_clip": 0.01308856, "auxiliary_loss_mlp": 0.01051486, "balance_loss_clip": 1.08974135, "balance_loss_mlp": 1.03687668, "epoch": 0.041724283051764566, "flos": 12091877614080.0, "grad_norm": 3.4041727436159155, "language_loss": 0.91711819, "learning_rate": 3.9985728319293285e-06, "loss": 0.94072163, "num_input_tokens_seen": 7347590, "step": 347, "time_per_iteration": 2.5365169048309326 }, { "auxiliary_loss_clip": 0.01307541, "auxiliary_loss_mlp": 0.01047657, "balance_loss_clip": 1.08506775, "balance_loss_mlp": 1.03229117, "epoch": 0.041844525942403656, "flos": 12385303816320.0, "grad_norm": 13.071677555497837, "language_loss": 0.85614735, "learning_rate": 3.998543257693501e-06, "loss": 0.87969935, "num_input_tokens_seen": 7364345, "step": 348, "time_per_iteration": 2.501319169998169 }, { "auxiliary_loss_clip": 0.01310431, "auxiliary_loss_mlp": 0.01063926, "balance_loss_clip": 1.09192967, "balance_loss_mlp": 1.05043769, "epoch": 0.041964768833042745, "flos": 23769345041280.0, "grad_norm": 1.6547325953331997, "language_loss": 0.87745696, "learning_rate": 3.998513380280251e-06, "loss": 0.90120053, "num_input_tokens_seen": 7384625, "step": 349, "time_per_iteration": 2.5608365535736084 }, { "auxiliary_loss_clip": 0.0131299, "auxiliary_loss_mlp": 0.01065638, "balance_loss_clip": 1.09318435, "balance_loss_mlp": 1.05107117, "epoch": 0.042085011723681835, "flos": 11875336473600.0, "grad_norm": 2.26618502574991, "language_loss": 0.94824517, "learning_rate": 3.99848319969411e-06, "loss": 0.97203147, "num_input_tokens_seen": 7402225, "step": 350, "time_per_iteration": 2.504929780960083 }, { "auxiliary_loss_clip": 0.01312867, "auxiliary_loss_mlp": 0.0107688, "balance_loss_clip": 1.09304142, "balance_loss_mlp": 1.06300998, "epoch": 0.042205254614320925, "flos": 16873957964160.0, "grad_norm": 2.0352777124850583, "language_loss": 0.79412478, "learning_rate": 3.9984527159396564e-06, "loss": 0.81802225, "num_input_tokens_seen": 7420865, "step": 351, "time_per_iteration": 2.5298609733581543 }, { "auxiliary_loss_clip": 0.01303921, "auxiliary_loss_mlp": 0.01057811, "balance_loss_clip": 1.0849191, "balance_loss_mlp": 1.04366136, "epoch": 0.04232549750496002, "flos": 25118508810240.0, "grad_norm": 2.067214404279356, "language_loss": 0.84328985, "learning_rate": 3.9984219290215154e-06, "loss": 0.86690724, "num_input_tokens_seen": 7441040, "step": 352, "time_per_iteration": 2.55487322807312 }, { "auxiliary_loss_clip": 0.01305516, "auxiliary_loss_mlp": 0.01053343, "balance_loss_clip": 1.08811975, "balance_loss_mlp": 1.04074883, "epoch": 0.04244574039559911, "flos": 26724541714560.0, "grad_norm": 1.616194315758387, "language_loss": 0.89214641, "learning_rate": 3.998390838944356e-06, "loss": 0.91573501, "num_input_tokens_seen": 7462545, "step": 353, "time_per_iteration": 2.5857582092285156 }, { "auxiliary_loss_clip": 0.01307244, "auxiliary_loss_mlp": 0.01060927, "balance_loss_clip": 1.08902609, "balance_loss_mlp": 1.04650307, "epoch": 0.0425659832862382, "flos": 20923244951040.0, "grad_norm": 2.409808096594608, "language_loss": 0.90238363, "learning_rate": 3.998359445712895e-06, "loss": 0.92606533, "num_input_tokens_seen": 7481650, "step": 354, "time_per_iteration": 2.5512373447418213 }, { "auxiliary_loss_clip": 0.01305481, "auxiliary_loss_mlp": 0.01056678, "balance_loss_clip": 1.08575869, "balance_loss_mlp": 1.04201555, "epoch": 0.04268622617687729, "flos": 23331127115520.0, "grad_norm": 2.097075536437559, "language_loss": 0.81085157, "learning_rate": 3.9983277493318955e-06, "loss": 0.83447313, "num_input_tokens_seen": 7500945, "step": 355, "time_per_iteration": 3.2719600200653076 }, { "auxiliary_loss_clip": 0.01308053, "auxiliary_loss_mlp": 0.01054099, "balance_loss_clip": 1.08785927, "balance_loss_mlp": 1.03860188, "epoch": 0.04280646906751638, "flos": 25994010908160.0, "grad_norm": 2.7928955387437213, "language_loss": 0.81115079, "learning_rate": 3.998295749806165e-06, "loss": 0.83477235, "num_input_tokens_seen": 7522170, "step": 356, "time_per_iteration": 2.5935537815093994 }, { "auxiliary_loss_clip": 0.01307585, "auxiliary_loss_mlp": 0.01064414, "balance_loss_clip": 1.0898962, "balance_loss_mlp": 1.05015612, "epoch": 0.04292671195815547, "flos": 26906824258560.0, "grad_norm": 2.437499876955015, "language_loss": 0.83409923, "learning_rate": 3.998263447140558e-06, "loss": 0.8578192, "num_input_tokens_seen": 7542370, "step": 357, "time_per_iteration": 3.286508321762085 }, { "auxiliary_loss_clip": 0.01301079, "auxiliary_loss_mlp": 0.01057778, "balance_loss_clip": 1.08446372, "balance_loss_mlp": 1.0431397, "epoch": 0.04304695484879457, "flos": 39457315745280.0, "grad_norm": 1.8402279960007148, "language_loss": 0.81688887, "learning_rate": 3.998230841339976e-06, "loss": 0.84047747, "num_input_tokens_seen": 7564380, "step": 358, "time_per_iteration": 2.6848254203796387 }, { "auxiliary_loss_clip": 0.01304409, "auxiliary_loss_mlp": 0.01060019, "balance_loss_clip": 1.08799839, "balance_loss_mlp": 1.04742432, "epoch": 0.04316719773943366, "flos": 19646297475840.0, "grad_norm": 2.2890338487867936, "language_loss": 0.85276091, "learning_rate": 3.998197932409363e-06, "loss": 0.87640524, "num_input_tokens_seen": 7582390, "step": 359, "time_per_iteration": 4.163724422454834 }, { "auxiliary_loss_clip": 0.01306656, "auxiliary_loss_mlp": 0.01056826, "balance_loss_clip": 1.09133101, "balance_loss_mlp": 1.04312336, "epoch": 0.04328744063007275, "flos": 22452320966400.0, "grad_norm": 2.8074454328020577, "language_loss": 0.8607778, "learning_rate": 3.9981647203537125e-06, "loss": 0.88441265, "num_input_tokens_seen": 7599890, "step": 360, "time_per_iteration": 2.5552849769592285 }, { "auxiliary_loss_clip": 0.01306705, "auxiliary_loss_mlp": 0.01053239, "balance_loss_clip": 1.0887748, "balance_loss_mlp": 1.040025, "epoch": 0.04340768352071184, "flos": 21283033530240.0, "grad_norm": 2.2480959030467185, "language_loss": 0.95519155, "learning_rate": 3.998131205178063e-06, "loss": 0.978791, "num_input_tokens_seen": 7618360, "step": 361, "time_per_iteration": 2.648998737335205 }, { "auxiliary_loss_clip": 0.01305118, "auxiliary_loss_mlp": 0.01055471, "balance_loss_clip": 1.08607912, "balance_loss_mlp": 1.04102266, "epoch": 0.04352792641135093, "flos": 11583705951360.0, "grad_norm": 2.4925768614398502, "language_loss": 0.76655912, "learning_rate": 3.998097386887498e-06, "loss": 0.79016501, "num_input_tokens_seen": 7635435, "step": 362, "time_per_iteration": 2.5709354877471924 }, { "auxiliary_loss_clip": 0.01299339, "auxiliary_loss_mlp": 0.01058494, "balance_loss_clip": 1.08592558, "balance_loss_mlp": 1.04473782, "epoch": 0.04364816930199002, "flos": 23623547736960.0, "grad_norm": 1.732202133324105, "language_loss": 0.84658849, "learning_rate": 3.998063265487148e-06, "loss": 0.8701669, "num_input_tokens_seen": 7656485, "step": 363, "time_per_iteration": 2.6171112060546875 }, { "auxiliary_loss_clip": 0.01302672, "auxiliary_loss_mlp": 0.01060728, "balance_loss_clip": 1.08827174, "balance_loss_mlp": 1.04729342, "epoch": 0.043768412192629114, "flos": 14429734214400.0, "grad_norm": 1.828003254274323, "language_loss": 0.80775303, "learning_rate": 3.99802884098219e-06, "loss": 0.83138704, "num_input_tokens_seen": 7674595, "step": 364, "time_per_iteration": 2.5265486240386963 }, { "auxiliary_loss_clip": 0.01302109, "auxiliary_loss_mlp": 0.01048308, "balance_loss_clip": 1.08543992, "balance_loss_mlp": 1.03394306, "epoch": 0.043888655083268203, "flos": 26468893641600.0, "grad_norm": 2.166315302224309, "language_loss": 0.82275569, "learning_rate": 3.997994113377845e-06, "loss": 0.84625983, "num_input_tokens_seen": 7693495, "step": 365, "time_per_iteration": 2.613940477371216 }, { "auxiliary_loss_clip": 0.01301036, "auxiliary_loss_mlp": 0.01057627, "balance_loss_clip": 1.08585835, "balance_loss_mlp": 1.04401958, "epoch": 0.04400889797390729, "flos": 27235263242880.0, "grad_norm": 2.107104815608086, "language_loss": 0.83271766, "learning_rate": 3.9979590826793815e-06, "loss": 0.85630423, "num_input_tokens_seen": 7714685, "step": 366, "time_per_iteration": 2.5660226345062256 }, { "auxiliary_loss_clip": 0.01306218, "auxiliary_loss_mlp": 0.01051847, "balance_loss_clip": 1.09065711, "balance_loss_mlp": 1.0377264, "epoch": 0.04412914086454638, "flos": 20119528183680.0, "grad_norm": 2.4757305366761995, "language_loss": 0.81087899, "learning_rate": 3.997923748892113e-06, "loss": 0.8344596, "num_input_tokens_seen": 7734005, "step": 367, "time_per_iteration": 2.5656020641326904 }, { "auxiliary_loss_clip": 0.0129909, "auxiliary_loss_mlp": 0.01057027, "balance_loss_clip": 1.08692169, "balance_loss_mlp": 1.04358602, "epoch": 0.04424938375518547, "flos": 22604618632320.0, "grad_norm": 1.7416551695515288, "language_loss": 0.88569885, "learning_rate": 3.9978881120214015e-06, "loss": 0.90926003, "num_input_tokens_seen": 7755525, "step": 368, "time_per_iteration": 2.583942413330078 }, { "auxiliary_loss_clip": 0.01300208, "auxiliary_loss_mlp": 0.01058333, "balance_loss_clip": 1.0837276, "balance_loss_mlp": 1.04343176, "epoch": 0.04436962664582456, "flos": 24132365844480.0, "grad_norm": 1.8899362160155027, "language_loss": 0.79284316, "learning_rate": 3.997852172072652e-06, "loss": 0.81642854, "num_input_tokens_seen": 7776740, "step": 369, "time_per_iteration": 2.5564613342285156 }, { "auxiliary_loss_clip": 0.01307351, "auxiliary_loss_mlp": 0.01059, "balance_loss_clip": 1.08910441, "balance_loss_mlp": 1.04481399, "epoch": 0.04448986953646366, "flos": 18222906251520.0, "grad_norm": 3.1066548777172884, "language_loss": 0.89237714, "learning_rate": 3.9978159290513155e-06, "loss": 0.91604066, "num_input_tokens_seen": 7794820, "step": 370, "time_per_iteration": 2.5255751609802246 }, { "auxiliary_loss_clip": 0.01305853, "auxiliary_loss_mlp": 0.01050523, "balance_loss_clip": 1.08806777, "balance_loss_mlp": 1.0358839, "epoch": 0.04461011242710275, "flos": 30117920400000.0, "grad_norm": 2.4965919564950423, "language_loss": 0.80074084, "learning_rate": 3.997779382962892e-06, "loss": 0.82430458, "num_input_tokens_seen": 7817705, "step": 371, "time_per_iteration": 2.60319447517395 }, { "auxiliary_loss_clip": 0.01299034, "auxiliary_loss_mlp": 0.01047547, "balance_loss_clip": 1.08420706, "balance_loss_mlp": 1.03434455, "epoch": 0.04473035531774184, "flos": 29752529299200.0, "grad_norm": 2.573831449450958, "language_loss": 0.7379204, "learning_rate": 3.997742533812924e-06, "loss": 0.76138628, "num_input_tokens_seen": 7840970, "step": 372, "time_per_iteration": 2.5902652740478516 }, { "auxiliary_loss_clip": 0.01304295, "auxiliary_loss_mlp": 0.01056549, "balance_loss_clip": 1.08808279, "balance_loss_mlp": 1.0417912, "epoch": 0.04485059820838093, "flos": 13151565676800.0, "grad_norm": 3.3210445813353386, "language_loss": 0.92268425, "learning_rate": 3.997705381607001e-06, "loss": 0.94629264, "num_input_tokens_seen": 7857785, "step": 373, "time_per_iteration": 2.5294086933135986 }, { "auxiliary_loss_clip": 0.01213533, "auxiliary_loss_mlp": 0.01043804, "balance_loss_clip": 1.07620752, "balance_loss_mlp": 1.03843975, "epoch": 0.04497084109902002, "flos": 68094209548800.0, "grad_norm": 0.9762370706554626, "language_loss": 0.6031006, "learning_rate": 3.997667926350761e-06, "loss": 0.62567395, "num_input_tokens_seen": 7916115, "step": 374, "time_per_iteration": 3.0383386611938477 }, { "auxiliary_loss_clip": 0.01213675, "auxiliary_loss_mlp": 0.01032462, "balance_loss_clip": 1.0759573, "balance_loss_mlp": 1.02724099, "epoch": 0.04509108398965911, "flos": 64342263346560.0, "grad_norm": 0.897278684969327, "language_loss": 0.57758641, "learning_rate": 3.997630168049886e-06, "loss": 0.60004783, "num_input_tokens_seen": 7974480, "step": 375, "time_per_iteration": 3.140150547027588 }, { "auxiliary_loss_clip": 0.01305004, "auxiliary_loss_mlp": 0.01050691, "balance_loss_clip": 1.08762658, "balance_loss_mlp": 1.03651106, "epoch": 0.045211326880298205, "flos": 22271115830400.0, "grad_norm": 2.1872675984514025, "language_loss": 0.7732361, "learning_rate": 3.997592106710101e-06, "loss": 0.7967931, "num_input_tokens_seen": 7993940, "step": 376, "time_per_iteration": 2.5851426124572754 }, { "auxiliary_loss_clip": 0.01303004, "auxiliary_loss_mlp": 0.01053087, "balance_loss_clip": 1.08753943, "balance_loss_mlp": 1.0392884, "epoch": 0.045331569770937295, "flos": 32159441796480.0, "grad_norm": 2.62671355901087, "language_loss": 0.65748513, "learning_rate": 3.997553742337182e-06, "loss": 0.68104607, "num_input_tokens_seen": 8013365, "step": 377, "time_per_iteration": 2.6352899074554443 }, { "auxiliary_loss_clip": 0.0130018, "auxiliary_loss_mlp": 0.01055299, "balance_loss_clip": 1.08636856, "balance_loss_mlp": 1.04073715, "epoch": 0.045451812661576385, "flos": 22163455791360.0, "grad_norm": 2.0924302645276045, "language_loss": 0.9140929, "learning_rate": 3.997515074936949e-06, "loss": 0.9376477, "num_input_tokens_seen": 8034240, "step": 378, "time_per_iteration": 2.5796120166778564 }, { "auxiliary_loss_clip": 0.01303683, "auxiliary_loss_mlp": 0.01066653, "balance_loss_clip": 1.08977962, "balance_loss_mlp": 1.05145407, "epoch": 0.045572055552215475, "flos": 16581968305920.0, "grad_norm": 2.5431639632885843, "language_loss": 0.86978, "learning_rate": 3.997476104515268e-06, "loss": 0.8934834, "num_input_tokens_seen": 8052430, "step": 379, "time_per_iteration": 2.5220797061920166 }, { "auxiliary_loss_clip": 0.01301228, "auxiliary_loss_mlp": 0.01054584, "balance_loss_clip": 1.08850706, "balance_loss_mlp": 1.04154849, "epoch": 0.045692298442854565, "flos": 17603375448960.0, "grad_norm": 5.8900836238518375, "language_loss": 0.77856112, "learning_rate": 3.9974368310780485e-06, "loss": 0.80211926, "num_input_tokens_seen": 8069605, "step": 380, "time_per_iteration": 2.5133159160614014 }, { "auxiliary_loss_clip": 0.0130749, "auxiliary_loss_mlp": 0.01069462, "balance_loss_clip": 1.08843994, "balance_loss_mlp": 1.05482352, "epoch": 0.045812541333493655, "flos": 26761098781440.0, "grad_norm": 2.7837063261425756, "language_loss": 0.74422026, "learning_rate": 3.997397254631251e-06, "loss": 0.76798975, "num_input_tokens_seen": 8090225, "step": 381, "time_per_iteration": 2.6074421405792236 }, { "auxiliary_loss_clip": 0.01201274, "auxiliary_loss_mlp": 0.01112401, "balance_loss_clip": 1.06633997, "balance_loss_mlp": 1.10746562, "epoch": 0.04593278422413275, "flos": 60250349894400.0, "grad_norm": 0.8679140422610949, "language_loss": 0.60062426, "learning_rate": 3.997357375180878e-06, "loss": 0.623761, "num_input_tokens_seen": 8154505, "step": 382, "time_per_iteration": 3.8849434852600098 }, { "auxiliary_loss_clip": 0.01299759, "auxiliary_loss_mlp": 0.01047371, "balance_loss_clip": 1.08752191, "balance_loss_mlp": 1.03322077, "epoch": 0.04605302711477184, "flos": 21799249839360.0, "grad_norm": 2.0100369123222683, "language_loss": 0.75250077, "learning_rate": 3.997317192732979e-06, "loss": 0.77597213, "num_input_tokens_seen": 8173285, "step": 383, "time_per_iteration": 2.5444180965423584 }, { "auxiliary_loss_clip": 0.01305749, "auxiliary_loss_mlp": 0.01057162, "balance_loss_clip": 1.09006119, "balance_loss_mlp": 1.0434525, "epoch": 0.04617327000541093, "flos": 19459705299840.0, "grad_norm": 1.9247383853394031, "language_loss": 0.8271963, "learning_rate": 3.99727670729365e-06, "loss": 0.85082537, "num_input_tokens_seen": 8191845, "step": 384, "time_per_iteration": 3.246682643890381 }, { "auxiliary_loss_clip": 0.01305886, "auxiliary_loss_mlp": 0.01059306, "balance_loss_clip": 1.09291458, "balance_loss_mlp": 1.04671144, "epoch": 0.04629351289605002, "flos": 25411468135680.0, "grad_norm": 1.7594548493690716, "language_loss": 0.77771282, "learning_rate": 3.997235918869033e-06, "loss": 0.80136472, "num_input_tokens_seen": 8212880, "step": 385, "time_per_iteration": 3.4974958896636963 }, { "auxiliary_loss_clip": 0.01300716, "auxiliary_loss_mlp": 0.01047652, "balance_loss_clip": 1.0876348, "balance_loss_mlp": 1.03357959, "epoch": 0.04641375578668911, "flos": 20558284813440.0, "grad_norm": 1.8510229169930243, "language_loss": 0.82472646, "learning_rate": 3.997194827465315e-06, "loss": 0.84821022, "num_input_tokens_seen": 8231475, "step": 386, "time_per_iteration": 2.520608901977539 }, { "auxiliary_loss_clip": 0.01297269, "auxiliary_loss_mlp": 0.01054786, "balance_loss_clip": 1.08440113, "balance_loss_mlp": 1.04032588, "epoch": 0.0465339986773282, "flos": 13188661447680.0, "grad_norm": 3.342638381318503, "language_loss": 0.91039729, "learning_rate": 3.997153433088728e-06, "loss": 0.93391788, "num_input_tokens_seen": 8248600, "step": 387, "time_per_iteration": 2.4972915649414062 }, { "auxiliary_loss_clip": 0.01301794, "auxiliary_loss_mlp": 0.01071441, "balance_loss_clip": 1.08865023, "balance_loss_mlp": 1.05676603, "epoch": 0.0466542415679673, "flos": 25556547168000.0, "grad_norm": 1.9588800846267977, "language_loss": 0.81182247, "learning_rate": 3.997111735745554e-06, "loss": 0.83555472, "num_input_tokens_seen": 8271570, "step": 388, "time_per_iteration": 2.557467222213745 }, { "auxiliary_loss_clip": 0.01302379, "auxiliary_loss_mlp": 0.01052859, "balance_loss_clip": 1.08969259, "balance_loss_mlp": 1.0386374, "epoch": 0.04677448445860639, "flos": 22236749493120.0, "grad_norm": 1.8425311138282774, "language_loss": 0.82694167, "learning_rate": 3.997069735442118e-06, "loss": 0.85049409, "num_input_tokens_seen": 8291265, "step": 389, "time_per_iteration": 2.540644884109497 }, { "auxiliary_loss_clip": 0.01297284, "auxiliary_loss_mlp": 0.01055303, "balance_loss_clip": 1.08510876, "balance_loss_mlp": 1.04182053, "epoch": 0.04689472734924548, "flos": 28147825198080.0, "grad_norm": 1.7323307251928022, "language_loss": 0.80197328, "learning_rate": 3.997027432184792e-06, "loss": 0.82549918, "num_input_tokens_seen": 8315925, "step": 390, "time_per_iteration": 2.6028120517730713 }, { "auxiliary_loss_clip": 0.01299828, "auxiliary_loss_mlp": 0.01058999, "balance_loss_clip": 1.08555412, "balance_loss_mlp": 1.04425263, "epoch": 0.04701497023988457, "flos": 23148952312320.0, "grad_norm": 1.8378619195442716, "language_loss": 0.89240122, "learning_rate": 3.99698482597999e-06, "loss": 0.91598946, "num_input_tokens_seen": 8333605, "step": 391, "time_per_iteration": 2.539053440093994 }, { "auxiliary_loss_clip": 0.01188977, "auxiliary_loss_mlp": 0.01030567, "balance_loss_clip": 1.05706573, "balance_loss_mlp": 1.02565598, "epoch": 0.04713521313052366, "flos": 64827668764800.0, "grad_norm": 0.8892898289068121, "language_loss": 0.63951337, "learning_rate": 3.99694191683418e-06, "loss": 0.66170883, "num_input_tokens_seen": 8394405, "step": 392, "time_per_iteration": 3.1284377574920654 }, { "auxiliary_loss_clip": 0.01303388, "auxiliary_loss_mlp": 0.01064254, "balance_loss_clip": 1.08962464, "balance_loss_mlp": 1.04895353, "epoch": 0.047255456021162746, "flos": 18771585477120.0, "grad_norm": 1.8553547007987088, "language_loss": 0.8194291, "learning_rate": 3.996898704753867e-06, "loss": 0.84310549, "num_input_tokens_seen": 8412355, "step": 393, "time_per_iteration": 2.5147111415863037 }, { "auxiliary_loss_clip": 0.01300102, "auxiliary_loss_mlp": 0.01053776, "balance_loss_clip": 1.0856781, "balance_loss_mlp": 1.03821898, "epoch": 0.04737569891180184, "flos": 22053820504320.0, "grad_norm": 2.276158456191609, "language_loss": 0.87559038, "learning_rate": 3.996855189745609e-06, "loss": 0.89912921, "num_input_tokens_seen": 8431620, "step": 394, "time_per_iteration": 2.519475221633911 }, { "auxiliary_loss_clip": 0.01297969, "auxiliary_loss_mlp": 0.01056434, "balance_loss_clip": 1.08481884, "balance_loss_mlp": 1.04203379, "epoch": 0.04749594180244093, "flos": 29057370410880.0, "grad_norm": 2.177305733168899, "language_loss": 0.924519, "learning_rate": 3.996811371816007e-06, "loss": 0.94806302, "num_input_tokens_seen": 8454045, "step": 395, "time_per_iteration": 2.579786539077759 }, { "auxiliary_loss_clip": 0.0130009, "auxiliary_loss_mlp": 0.01055205, "balance_loss_clip": 1.08881855, "balance_loss_mlp": 1.04074466, "epoch": 0.04761618469308002, "flos": 35112268172160.0, "grad_norm": 1.8585215509662585, "language_loss": 0.7783581, "learning_rate": 3.996767250971707e-06, "loss": 0.80191106, "num_input_tokens_seen": 8476785, "step": 396, "time_per_iteration": 2.6452834606170654 }, { "auxiliary_loss_clip": 0.01302419, "auxiliary_loss_mlp": 0.01061188, "balance_loss_clip": 1.089764, "balance_loss_mlp": 1.04797983, "epoch": 0.04773642758371911, "flos": 25630702796160.0, "grad_norm": 5.387316918295607, "language_loss": 0.86926305, "learning_rate": 3.996722827219403e-06, "loss": 0.89289916, "num_input_tokens_seen": 8498400, "step": 397, "time_per_iteration": 2.6004762649536133 }, { "auxiliary_loss_clip": 0.01301981, "auxiliary_loss_mlp": 0.0105534, "balance_loss_clip": 1.08983195, "balance_loss_mlp": 1.04107022, "epoch": 0.0478566704743582, "flos": 20631506688000.0, "grad_norm": 2.371611049687816, "language_loss": 0.82242262, "learning_rate": 3.996678100565833e-06, "loss": 0.84599584, "num_input_tokens_seen": 8517455, "step": 398, "time_per_iteration": 2.5355756282806396 }, { "auxiliary_loss_clip": 0.01294029, "auxiliary_loss_mlp": 0.01049261, "balance_loss_clip": 1.08367407, "balance_loss_mlp": 1.03505123, "epoch": 0.04797691336499729, "flos": 18835721210880.0, "grad_norm": 2.4935237954883798, "language_loss": 0.88365811, "learning_rate": 3.996633071017783e-06, "loss": 0.90709102, "num_input_tokens_seen": 8534085, "step": 399, "time_per_iteration": 2.5252017974853516 }, { "auxiliary_loss_clip": 0.01298547, "auxiliary_loss_mlp": 0.01052037, "balance_loss_clip": 1.08563828, "balance_loss_mlp": 1.03833961, "epoch": 0.04809715625563638, "flos": 21099673578240.0, "grad_norm": 2.7784191773277582, "language_loss": 0.81459171, "learning_rate": 3.996587738582084e-06, "loss": 0.83809757, "num_input_tokens_seen": 8550885, "step": 400, "time_per_iteration": 2.532257080078125 }, { "auxiliary_loss_clip": 0.01296374, "auxiliary_loss_mlp": 0.01057945, "balance_loss_clip": 1.08454335, "balance_loss_mlp": 1.0437119, "epoch": 0.04821739914627548, "flos": 23805650712960.0, "grad_norm": 2.767417968868868, "language_loss": 0.85933489, "learning_rate": 3.9965421032656115e-06, "loss": 0.88287807, "num_input_tokens_seen": 8570815, "step": 401, "time_per_iteration": 2.52763295173645 }, { "auxiliary_loss_clip": 0.0129743, "auxiliary_loss_mlp": 0.01056337, "balance_loss_clip": 1.08491325, "balance_loss_mlp": 1.04209149, "epoch": 0.04833764203691457, "flos": 22200587475840.0, "grad_norm": 2.7395367186996404, "language_loss": 0.94259155, "learning_rate": 3.99649616507529e-06, "loss": 0.9661293, "num_input_tokens_seen": 8589910, "step": 402, "time_per_iteration": 2.5382044315338135 }, { "auxiliary_loss_clip": 0.01183634, "auxiliary_loss_mlp": 0.01009718, "balance_loss_clip": 1.05383909, "balance_loss_mlp": 1.00516403, "epoch": 0.04845788492755366, "flos": 65904376896000.0, "grad_norm": 0.8892393253277029, "language_loss": 0.63120091, "learning_rate": 3.996449924018088e-06, "loss": 0.65313447, "num_input_tokens_seen": 8650370, "step": 403, "time_per_iteration": 3.062020778656006 }, { "auxiliary_loss_clip": 0.01294488, "auxiliary_loss_mlp": 0.01051483, "balance_loss_clip": 1.08415258, "balance_loss_mlp": 1.03753591, "epoch": 0.04857812781819275, "flos": 19281301424640.0, "grad_norm": 2.384306696339672, "language_loss": 0.79400563, "learning_rate": 3.99640338010102e-06, "loss": 0.81746531, "num_input_tokens_seen": 8669475, "step": 404, "time_per_iteration": 2.536983013153076 }, { "auxiliary_loss_clip": 0.01291825, "auxiliary_loss_mlp": 0.01054814, "balance_loss_clip": 1.08265543, "balance_loss_mlp": 1.04100919, "epoch": 0.04869837070883184, "flos": 24062376193920.0, "grad_norm": 3.327937200990285, "language_loss": 0.78373933, "learning_rate": 3.996356533331146e-06, "loss": 0.80720568, "num_input_tokens_seen": 8691345, "step": 405, "time_per_iteration": 2.551147699356079 }, { "auxiliary_loss_clip": 0.01301255, "auxiliary_loss_mlp": 0.01053829, "balance_loss_clip": 1.08429801, "balance_loss_mlp": 1.03896403, "epoch": 0.04881861359947093, "flos": 25187169657600.0, "grad_norm": 2.9563094822042215, "language_loss": 0.61869043, "learning_rate": 3.996309383715573e-06, "loss": 0.64224136, "num_input_tokens_seen": 8710125, "step": 406, "time_per_iteration": 2.528376340866089 }, { "auxiliary_loss_clip": 0.01295865, "auxiliary_loss_mlp": 0.0104439, "balance_loss_clip": 1.08421755, "balance_loss_mlp": 1.02926278, "epoch": 0.048938856490110025, "flos": 16362913213440.0, "grad_norm": 2.9877722518969616, "language_loss": 0.73846602, "learning_rate": 3.996261931261454e-06, "loss": 0.7618686, "num_input_tokens_seen": 8728705, "step": 407, "time_per_iteration": 2.497581720352173 }, { "auxiliary_loss_clip": 0.01298022, "auxiliary_loss_mlp": 0.01052092, "balance_loss_clip": 1.08748698, "balance_loss_mlp": 1.03776312, "epoch": 0.049059099380749115, "flos": 29895094379520.0, "grad_norm": 1.5904793879279402, "language_loss": 0.86182833, "learning_rate": 3.996214175975987e-06, "loss": 0.88532948, "num_input_tokens_seen": 8749225, "step": 408, "time_per_iteration": 2.5930793285369873 }, { "auxiliary_loss_clip": 0.01298568, "auxiliary_loss_mlp": 0.01058239, "balance_loss_clip": 1.08786249, "balance_loss_mlp": 1.04433918, "epoch": 0.049179342271388204, "flos": 35918858027520.0, "grad_norm": 2.1007401857695345, "language_loss": 0.79300892, "learning_rate": 3.996166117866417e-06, "loss": 0.81657702, "num_input_tokens_seen": 8771160, "step": 409, "time_per_iteration": 3.4002935886383057 }, { "auxiliary_loss_clip": 0.01294389, "auxiliary_loss_mlp": 0.01043133, "balance_loss_clip": 1.08303809, "balance_loss_mlp": 1.02953076, "epoch": 0.049299585162027294, "flos": 14611226659200.0, "grad_norm": 1.9690163795562645, "language_loss": 0.86467993, "learning_rate": 3.996117756940035e-06, "loss": 0.88805509, "num_input_tokens_seen": 8787845, "step": 410, "time_per_iteration": 3.2073020935058594 }, { "auxiliary_loss_clip": 0.01298872, "auxiliary_loss_mlp": 0.01050953, "balance_loss_clip": 1.08781314, "balance_loss_mlp": 1.03717244, "epoch": 0.049419828052666384, "flos": 19567939956480.0, "grad_norm": 1.9783378658777173, "language_loss": 0.97630811, "learning_rate": 3.996069093204175e-06, "loss": 0.9998064, "num_input_tokens_seen": 8803805, "step": 411, "time_per_iteration": 2.5075764656066895 }, { "auxiliary_loss_clip": 0.01300983, "auxiliary_loss_mlp": 0.01044483, "balance_loss_clip": 1.08808351, "balance_loss_mlp": 1.03039205, "epoch": 0.049540070943305474, "flos": 13659916907520.0, "grad_norm": 2.592496252616047, "language_loss": 0.88169348, "learning_rate": 3.996020126666221e-06, "loss": 0.90514815, "num_input_tokens_seen": 8820785, "step": 412, "time_per_iteration": 3.264265775680542 }, { "auxiliary_loss_clip": 0.01296108, "auxiliary_loss_mlp": 0.01051977, "balance_loss_clip": 1.0858047, "balance_loss_mlp": 1.0378623, "epoch": 0.04966031383394457, "flos": 21832035978240.0, "grad_norm": 4.377381791143581, "language_loss": 0.82002091, "learning_rate": 3.995970857333601e-06, "loss": 0.84350175, "num_input_tokens_seen": 8841195, "step": 413, "time_per_iteration": 3.3601770401000977 }, { "auxiliary_loss_clip": 0.01296794, "auxiliary_loss_mlp": 0.0105326, "balance_loss_clip": 1.08526587, "balance_loss_mlp": 1.039819, "epoch": 0.04978055672458366, "flos": 28618793349120.0, "grad_norm": 2.2475093934647594, "language_loss": 0.79724431, "learning_rate": 3.995921285213789e-06, "loss": 0.82074487, "num_input_tokens_seen": 8861455, "step": 414, "time_per_iteration": 2.57621693611145 }, { "auxiliary_loss_clip": 0.01295517, "auxiliary_loss_mlp": 0.01049913, "balance_loss_clip": 1.08663404, "balance_loss_mlp": 1.03616202, "epoch": 0.04990079961522275, "flos": 19828220883840.0, "grad_norm": 2.1102372743679094, "language_loss": 0.80552173, "learning_rate": 3.995871410314305e-06, "loss": 0.82897604, "num_input_tokens_seen": 8880015, "step": 415, "time_per_iteration": 2.4838385581970215 }, { "auxiliary_loss_clip": 0.01167306, "auxiliary_loss_mlp": 0.01020829, "balance_loss_clip": 1.05264688, "balance_loss_mlp": 1.01629925, "epoch": 0.05002104250586184, "flos": 62735045293440.0, "grad_norm": 0.9071841059237318, "language_loss": 0.59636384, "learning_rate": 3.995821232642714e-06, "loss": 0.61824512, "num_input_tokens_seen": 8938420, "step": 416, "time_per_iteration": 3.1995089054107666 }, { "auxiliary_loss_clip": 0.01276868, "auxiliary_loss_mlp": 0.01065635, "balance_loss_clip": 1.08548546, "balance_loss_mlp": 1.05259371, "epoch": 0.05014128539650093, "flos": 27928518710400.0, "grad_norm": 2.1814105868005726, "language_loss": 0.82372421, "learning_rate": 3.995770752206629e-06, "loss": 0.84714925, "num_input_tokens_seen": 8959495, "step": 417, "time_per_iteration": 2.6392664909362793 }, { "auxiliary_loss_clip": 0.01294189, "auxiliary_loss_mlp": 0.01047843, "balance_loss_clip": 1.08543086, "balance_loss_mlp": 1.03430068, "epoch": 0.05026152828714002, "flos": 17705576620800.0, "grad_norm": 1.99972775542304, "language_loss": 0.97208148, "learning_rate": 3.995719969013709e-06, "loss": 0.99550188, "num_input_tokens_seen": 8976675, "step": 418, "time_per_iteration": 2.487362861633301 }, { "auxiliary_loss_clip": 0.01258053, "auxiliary_loss_mlp": 0.01050189, "balance_loss_clip": 1.08111763, "balance_loss_mlp": 1.03622365, "epoch": 0.05038177117777912, "flos": 19133277477120.0, "grad_norm": 2.468051138273594, "language_loss": 0.86171067, "learning_rate": 3.995668883071655e-06, "loss": 0.88479304, "num_input_tokens_seen": 8992900, "step": 419, "time_per_iteration": 2.5523312091827393 }, { "auxiliary_loss_clip": 0.0129759, "auxiliary_loss_mlp": 0.01049753, "balance_loss_clip": 1.08567035, "balance_loss_mlp": 1.0362525, "epoch": 0.050502014068418206, "flos": 20667704618880.0, "grad_norm": 2.1454004437651313, "language_loss": 0.90749693, "learning_rate": 3.995617494388219e-06, "loss": 0.93097043, "num_input_tokens_seen": 9011020, "step": 420, "time_per_iteration": 2.5103988647460938 }, { "auxiliary_loss_clip": 0.01255647, "auxiliary_loss_mlp": 0.01047747, "balance_loss_clip": 1.07592797, "balance_loss_mlp": 1.03388321, "epoch": 0.050622256959057296, "flos": 21361103740800.0, "grad_norm": 2.0217634439995242, "language_loss": 0.80180848, "learning_rate": 3.995565802971196e-06, "loss": 0.82484245, "num_input_tokens_seen": 9030995, "step": 421, "time_per_iteration": 2.554809808731079 }, { "auxiliary_loss_clip": 0.01253219, "auxiliary_loss_mlp": 0.01053273, "balance_loss_clip": 1.07750309, "balance_loss_mlp": 1.04055333, "epoch": 0.050742499849696386, "flos": 27673588909440.0, "grad_norm": 1.8357476880734556, "language_loss": 0.67166471, "learning_rate": 3.995513808828427e-06, "loss": 0.69472969, "num_input_tokens_seen": 9053790, "step": 422, "time_per_iteration": 2.6102421283721924 }, { "auxiliary_loss_clip": 0.01255398, "auxiliary_loss_mlp": 0.01054033, "balance_loss_clip": 1.07738996, "balance_loss_mlp": 1.04152226, "epoch": 0.050862742740335476, "flos": 19865999013120.0, "grad_norm": 2.1071532624206544, "language_loss": 0.76526892, "learning_rate": 3.9954615119678e-06, "loss": 0.78836322, "num_input_tokens_seen": 9072345, "step": 423, "time_per_iteration": 2.6151645183563232 }, { "auxiliary_loss_clip": 0.0126732, "auxiliary_loss_mlp": 0.01051632, "balance_loss_clip": 1.07964826, "balance_loss_mlp": 1.03835785, "epoch": 0.050982985630974566, "flos": 22085098272000.0, "grad_norm": 2.359323369248374, "language_loss": 0.80617499, "learning_rate": 3.995408912397248e-06, "loss": 0.82936454, "num_input_tokens_seen": 9090240, "step": 424, "time_per_iteration": 2.617138624191284 }, { "auxiliary_loss_clip": 0.01263592, "auxiliary_loss_mlp": 0.01051227, "balance_loss_clip": 1.08294737, "balance_loss_mlp": 1.03799462, "epoch": 0.05110322852161366, "flos": 20740962407040.0, "grad_norm": 2.6523715599048336, "language_loss": 0.93048763, "learning_rate": 3.99535601012475e-06, "loss": 0.95363581, "num_input_tokens_seen": 9105570, "step": 425, "time_per_iteration": 2.555788278579712 }, { "auxiliary_loss_clip": 0.01239661, "auxiliary_loss_mlp": 0.00765854, "balance_loss_clip": 1.0765276, "balance_loss_mlp": 1.00021994, "epoch": 0.05122347141225275, "flos": 28547295327360.0, "grad_norm": 1.573369411613478, "language_loss": 0.75455809, "learning_rate": 3.995302805158333e-06, "loss": 0.77461326, "num_input_tokens_seen": 9128225, "step": 426, "time_per_iteration": 2.6614022254943848 }, { "auxiliary_loss_clip": 0.01245788, "auxiliary_loss_mlp": 0.01049733, "balance_loss_clip": 1.07432151, "balance_loss_mlp": 1.03483212, "epoch": 0.05134371430289184, "flos": 19722679747200.0, "grad_norm": 1.9382619732740944, "language_loss": 0.83390868, "learning_rate": 3.9952492975060665e-06, "loss": 0.85686386, "num_input_tokens_seen": 9148295, "step": 427, "time_per_iteration": 2.577453851699829 }, { "auxiliary_loss_clip": 0.01269587, "auxiliary_loss_mlp": 0.01052422, "balance_loss_clip": 1.0774678, "balance_loss_mlp": 1.03863537, "epoch": 0.05146395719353093, "flos": 34458945649920.0, "grad_norm": 3.360417968498193, "language_loss": 0.85334933, "learning_rate": 3.995195487176067e-06, "loss": 0.87656939, "num_input_tokens_seen": 9168525, "step": 428, "time_per_iteration": 2.6565427780151367 }, { "auxiliary_loss_clip": 0.01289953, "auxiliary_loss_mlp": 0.01051526, "balance_loss_clip": 1.08221388, "balance_loss_mlp": 1.0379777, "epoch": 0.05158420008417002, "flos": 21760286561280.0, "grad_norm": 2.267059024345018, "language_loss": 0.8572619, "learning_rate": 3.995141374176499e-06, "loss": 0.88067663, "num_input_tokens_seen": 9186920, "step": 429, "time_per_iteration": 2.5055947303771973 }, { "auxiliary_loss_clip": 0.01134975, "auxiliary_loss_mlp": 0.00756409, "balance_loss_clip": 1.0451827, "balance_loss_mlp": 1.00035572, "epoch": 0.05170444297480911, "flos": 72553956226560.0, "grad_norm": 0.87072985540237, "language_loss": 0.6306479, "learning_rate": 3.995086958515572e-06, "loss": 0.64956164, "num_input_tokens_seen": 9244940, "step": 430, "time_per_iteration": 3.192450523376465 }, { "auxiliary_loss_clip": 0.01178492, "auxiliary_loss_mlp": 0.00756453, "balance_loss_clip": 1.05127549, "balance_loss_mlp": 1.00037003, "epoch": 0.05182468586544821, "flos": 62416159326720.0, "grad_norm": 0.861409078045119, "language_loss": 0.59941179, "learning_rate": 3.995032240201538e-06, "loss": 0.61876124, "num_input_tokens_seen": 9307335, "step": 431, "time_per_iteration": 3.071185827255249 }, { "auxiliary_loss_clip": 0.01156768, "auxiliary_loss_mlp": 0.01005101, "balance_loss_clip": 1.04923344, "balance_loss_mlp": 1.00061917, "epoch": 0.0519449287560873, "flos": 41225989432320.0, "grad_norm": 0.9376186999624959, "language_loss": 0.63126254, "learning_rate": 3.9949772192427e-06, "loss": 0.65288126, "num_input_tokens_seen": 9353960, "step": 432, "time_per_iteration": 2.881181478500366 }, { "auxiliary_loss_clip": 0.01254989, "auxiliary_loss_mlp": 0.01056224, "balance_loss_clip": 1.07559323, "balance_loss_mlp": 1.04291999, "epoch": 0.05206517164672639, "flos": 17494530261120.0, "grad_norm": 2.3182655093361073, "language_loss": 0.79703385, "learning_rate": 3.994921895647405e-06, "loss": 0.82014596, "num_input_tokens_seen": 9372130, "step": 433, "time_per_iteration": 2.6137983798980713 }, { "auxiliary_loss_clip": 0.01174519, "auxiliary_loss_mlp": 0.0100441, "balance_loss_clip": 1.04849982, "balance_loss_mlp": 0.9999277, "epoch": 0.05218541453736548, "flos": 64002762973440.0, "grad_norm": 0.8386474568642345, "language_loss": 0.55351651, "learning_rate": 3.994866269424043e-06, "loss": 0.57530582, "num_input_tokens_seen": 9428500, "step": 434, "time_per_iteration": 3.0588133335113525 }, { "auxiliary_loss_clip": 0.01200905, "auxiliary_loss_mlp": 0.01046583, "balance_loss_clip": 1.062482, "balance_loss_mlp": 1.03335643, "epoch": 0.05230565742800457, "flos": 19317319787520.0, "grad_norm": 2.522417044646948, "language_loss": 0.78459275, "learning_rate": 3.9948103405810545e-06, "loss": 0.80706763, "num_input_tokens_seen": 9447450, "step": 435, "time_per_iteration": 3.4613616466522217 }, { "auxiliary_loss_clip": 0.01222064, "auxiliary_loss_mlp": 0.01047497, "balance_loss_clip": 1.07028818, "balance_loss_mlp": 1.03465199, "epoch": 0.05242590031864366, "flos": 25298636538240.0, "grad_norm": 2.085014167704342, "language_loss": 0.85912788, "learning_rate": 3.994754109126923e-06, "loss": 0.88182354, "num_input_tokens_seen": 9468945, "step": 436, "time_per_iteration": 3.416391372680664 }, { "auxiliary_loss_clip": 0.01194591, "auxiliary_loss_mlp": 0.01044413, "balance_loss_clip": 1.06957769, "balance_loss_mlp": 1.03187799, "epoch": 0.052546143209282754, "flos": 26211629456640.0, "grad_norm": 1.7412550006792054, "language_loss": 0.93521237, "learning_rate": 3.994697575070181e-06, "loss": 0.95760238, "num_input_tokens_seen": 9488405, "step": 437, "time_per_iteration": 2.6974403858184814 }, { "auxiliary_loss_clip": 0.01252336, "auxiliary_loss_mlp": 0.01053546, "balance_loss_clip": 1.07903302, "balance_loss_mlp": 1.04083276, "epoch": 0.052666386099921844, "flos": 22158140578560.0, "grad_norm": 1.7437034635853288, "language_loss": 0.91521209, "learning_rate": 3.994640738419402e-06, "loss": 0.93827087, "num_input_tokens_seen": 9507780, "step": 438, "time_per_iteration": 3.303598165512085 }, { "auxiliary_loss_clip": 0.01271946, "auxiliary_loss_mlp": 0.01050872, "balance_loss_clip": 1.07973862, "balance_loss_mlp": 1.03834939, "epoch": 0.052786628990560934, "flos": 23881817502720.0, "grad_norm": 1.9401678109865004, "language_loss": 0.80728865, "learning_rate": 3.9945835991832075e-06, "loss": 0.83051687, "num_input_tokens_seen": 9529665, "step": 439, "time_per_iteration": 3.4677796363830566 }, { "auxiliary_loss_clip": 0.01289536, "auxiliary_loss_mlp": 0.01061502, "balance_loss_clip": 1.08461857, "balance_loss_mlp": 1.04893744, "epoch": 0.052906871881200024, "flos": 24605021934720.0, "grad_norm": 2.128685379124622, "language_loss": 0.93042034, "learning_rate": 3.994526157370268e-06, "loss": 0.95393074, "num_input_tokens_seen": 9548280, "step": 440, "time_per_iteration": 2.601649284362793 }, { "auxiliary_loss_clip": 0.01144662, "auxiliary_loss_mlp": 0.01006134, "balance_loss_clip": 1.03857684, "balance_loss_mlp": 1.00172281, "epoch": 0.053027114771839114, "flos": 56461631143680.0, "grad_norm": 0.8952910628839714, "language_loss": 0.59311569, "learning_rate": 3.994468412989296e-06, "loss": 0.61462361, "num_input_tokens_seen": 9609690, "step": 441, "time_per_iteration": 3.3135902881622314 }, { "auxiliary_loss_clip": 0.01225044, "auxiliary_loss_mlp": 0.0106072, "balance_loss_clip": 1.07109904, "balance_loss_mlp": 1.04780972, "epoch": 0.053147357662478203, "flos": 17311098481920.0, "grad_norm": 1.8989996564038183, "language_loss": 0.92544341, "learning_rate": 3.994410366049052e-06, "loss": 0.94830108, "num_input_tokens_seen": 9627550, "step": 442, "time_per_iteration": 2.706190347671509 }, { "auxiliary_loss_clip": 0.01271602, "auxiliary_loss_mlp": 0.01050672, "balance_loss_clip": 1.08151484, "balance_loss_mlp": 1.03729713, "epoch": 0.0532676005531173, "flos": 17164977955200.0, "grad_norm": 3.4508367864686607, "language_loss": 0.83190441, "learning_rate": 3.994352016558341e-06, "loss": 0.85512722, "num_input_tokens_seen": 9644855, "step": 443, "time_per_iteration": 2.5963544845581055 }, { "auxiliary_loss_clip": 0.01267008, "auxiliary_loss_mlp": 0.01043487, "balance_loss_clip": 1.08181465, "balance_loss_mlp": 1.03077924, "epoch": 0.05338784344375639, "flos": 27819960831360.0, "grad_norm": 1.8030842004347758, "language_loss": 0.73726535, "learning_rate": 3.994293364526014e-06, "loss": 0.76037025, "num_input_tokens_seen": 9665740, "step": 444, "time_per_iteration": 2.635636568069458 }, { "auxiliary_loss_clip": 0.01244395, "auxiliary_loss_mlp": 0.01059504, "balance_loss_clip": 1.07720017, "balance_loss_mlp": 1.04525292, "epoch": 0.05350808633439548, "flos": 21507691144320.0, "grad_norm": 1.8900497194750072, "language_loss": 0.849015, "learning_rate": 3.99423440996097e-06, "loss": 0.87205398, "num_input_tokens_seen": 9685280, "step": 445, "time_per_iteration": 2.613539457321167 }, { "auxiliary_loss_clip": 0.01260712, "auxiliary_loss_mlp": 0.01049994, "balance_loss_clip": 1.08255184, "balance_loss_mlp": 1.03690481, "epoch": 0.05362832922503457, "flos": 20084299920000.0, "grad_norm": 2.8864332708279488, "language_loss": 0.81618953, "learning_rate": 3.994175152872152e-06, "loss": 0.83929658, "num_input_tokens_seen": 9704365, "step": 446, "time_per_iteration": 2.5638628005981445 }, { "auxiliary_loss_clip": 0.01272591, "auxiliary_loss_mlp": 0.01049621, "balance_loss_clip": 1.08070183, "balance_loss_mlp": 1.03755105, "epoch": 0.05374857211567366, "flos": 26137222433280.0, "grad_norm": 2.571150385035351, "language_loss": 0.78572059, "learning_rate": 3.994115593268548e-06, "loss": 0.80894279, "num_input_tokens_seen": 9724145, "step": 447, "time_per_iteration": 2.602689743041992 }, { "auxiliary_loss_clip": 0.01287673, "auxiliary_loss_mlp": 0.01052261, "balance_loss_clip": 1.08456528, "balance_loss_mlp": 1.03927898, "epoch": 0.05386881500631275, "flos": 27486817165440.0, "grad_norm": 3.1059486198903636, "language_loss": 0.8194207, "learning_rate": 3.994055731159195e-06, "loss": 0.84281999, "num_input_tokens_seen": 9741615, "step": 448, "time_per_iteration": 2.5432629585266113 }, { "auxiliary_loss_clip": 0.01277357, "auxiliary_loss_mlp": 0.01056202, "balance_loss_clip": 1.08640933, "balance_loss_mlp": 1.04434037, "epoch": 0.053989057896951846, "flos": 23585087249280.0, "grad_norm": 1.8270769778445632, "language_loss": 0.86827898, "learning_rate": 3.993995566553172e-06, "loss": 0.89161456, "num_input_tokens_seen": 9760580, "step": 449, "time_per_iteration": 2.571065902709961 }, { "auxiliary_loss_clip": 0.01231399, "auxiliary_loss_mlp": 0.01054468, "balance_loss_clip": 1.07058597, "balance_loss_mlp": 1.04112864, "epoch": 0.054109300787590936, "flos": 25228862369280.0, "grad_norm": 1.7350831621146896, "language_loss": 0.77089834, "learning_rate": 3.993935099459607e-06, "loss": 0.79375702, "num_input_tokens_seen": 9782195, "step": 450, "time_per_iteration": 2.6469762325286865 }, { "auxiliary_loss_clip": 0.01281178, "auxiliary_loss_mlp": 0.01049769, "balance_loss_clip": 1.08079302, "balance_loss_mlp": 1.038903, "epoch": 0.054229543678230026, "flos": 23841525421440.0, "grad_norm": 1.9774020252577473, "language_loss": 0.73769426, "learning_rate": 3.993874329887673e-06, "loss": 0.76100379, "num_input_tokens_seen": 9800850, "step": 451, "time_per_iteration": 2.528331756591797 }, { "auxiliary_loss_clip": 0.01268189, "auxiliary_loss_mlp": 0.0104603, "balance_loss_clip": 1.08129859, "balance_loss_mlp": 1.03326893, "epoch": 0.054349786568869116, "flos": 16320933192960.0, "grad_norm": 2.848805563495819, "language_loss": 0.86948776, "learning_rate": 3.993813257846589e-06, "loss": 0.89262998, "num_input_tokens_seen": 9817605, "step": 452, "time_per_iteration": 2.5223114490509033 }, { "auxiliary_loss_clip": 0.01271749, "auxiliary_loss_mlp": 0.01059343, "balance_loss_clip": 1.08173156, "balance_loss_mlp": 1.04540706, "epoch": 0.054470029459508205, "flos": 18660729127680.0, "grad_norm": 2.4571028221530606, "language_loss": 0.92884243, "learning_rate": 3.993751883345619e-06, "loss": 0.95215338, "num_input_tokens_seen": 9835965, "step": 453, "time_per_iteration": 2.569429874420166 }, { "auxiliary_loss_clip": 0.01253113, "auxiliary_loss_mlp": 0.0105317, "balance_loss_clip": 1.07999086, "balance_loss_mlp": 1.03958035, "epoch": 0.054590272350147295, "flos": 17785298856960.0, "grad_norm": 2.9615395213201423, "language_loss": 0.87885469, "learning_rate": 3.993690206394073e-06, "loss": 0.90191758, "num_input_tokens_seen": 9852265, "step": 454, "time_per_iteration": 2.6793081760406494 }, { "auxiliary_loss_clip": 0.01259241, "auxiliary_loss_mlp": 0.01053685, "balance_loss_clip": 1.07823539, "balance_loss_mlp": 1.04088151, "epoch": 0.054710515240786385, "flos": 17785945301760.0, "grad_norm": 2.3829721169322804, "language_loss": 0.87610936, "learning_rate": 3.993628227001307e-06, "loss": 0.89923859, "num_input_tokens_seen": 9870465, "step": 455, "time_per_iteration": 2.600213050842285 }, { "auxiliary_loss_clip": 0.01253476, "auxiliary_loss_mlp": 0.01060696, "balance_loss_clip": 1.07685494, "balance_loss_mlp": 1.04775548, "epoch": 0.05483075813142548, "flos": 48210900180480.0, "grad_norm": 1.86371478785799, "language_loss": 0.71201313, "learning_rate": 3.993565945176726e-06, "loss": 0.73515487, "num_input_tokens_seen": 9891490, "step": 456, "time_per_iteration": 2.8394179344177246 }, { "auxiliary_loss_clip": 0.01252527, "auxiliary_loss_mlp": 0.01053686, "balance_loss_clip": 1.07886767, "balance_loss_mlp": 1.04117525, "epoch": 0.05495100102206457, "flos": 19682244011520.0, "grad_norm": 1.8961694396020916, "language_loss": 0.84159821, "learning_rate": 3.993503360929776e-06, "loss": 0.86466032, "num_input_tokens_seen": 9910375, "step": 457, "time_per_iteration": 2.623880386352539 }, { "auxiliary_loss_clip": 0.01186274, "auxiliary_loss_mlp": 0.01055995, "balance_loss_clip": 1.06665397, "balance_loss_mlp": 1.04293013, "epoch": 0.05507124391270366, "flos": 26360048453760.0, "grad_norm": 1.6163516234840345, "language_loss": 0.80881095, "learning_rate": 3.99344047426995e-06, "loss": 0.83123362, "num_input_tokens_seen": 9931635, "step": 458, "time_per_iteration": 2.973371744155884 }, { "auxiliary_loss_clip": 0.01223668, "auxiliary_loss_mlp": 0.01055138, "balance_loss_clip": 1.07347775, "balance_loss_mlp": 1.04133368, "epoch": 0.05519148680334275, "flos": 22601314581120.0, "grad_norm": 2.422910552736015, "language_loss": 0.93424994, "learning_rate": 3.993377285206789e-06, "loss": 0.95703799, "num_input_tokens_seen": 9951420, "step": 459, "time_per_iteration": 2.9041428565979004 }, { "auxiliary_loss_clip": 0.01208098, "auxiliary_loss_mlp": 0.01054909, "balance_loss_clip": 1.06985867, "balance_loss_mlp": 1.04205275, "epoch": 0.05531172969398184, "flos": 40552519380480.0, "grad_norm": 1.6399430347489103, "language_loss": 0.86271828, "learning_rate": 3.99331379374988e-06, "loss": 0.88534838, "num_input_tokens_seen": 9975025, "step": 460, "time_per_iteration": 2.806896448135376 }, { "auxiliary_loss_clip": 0.01250109, "auxiliary_loss_mlp": 0.01047104, "balance_loss_clip": 1.07105684, "balance_loss_mlp": 1.03405058, "epoch": 0.05543197258462093, "flos": 23477894087040.0, "grad_norm": 1.9169020249850914, "language_loss": 0.79927778, "learning_rate": 3.993249999908852e-06, "loss": 0.82224989, "num_input_tokens_seen": 9995175, "step": 461, "time_per_iteration": 3.909759759902954 }, { "auxiliary_loss_clip": 0.0128572, "auxiliary_loss_mlp": 0.01056352, "balance_loss_clip": 1.08106542, "balance_loss_mlp": 1.04258919, "epoch": 0.05555221547526003, "flos": 18624603024000.0, "grad_norm": 2.0178269687371406, "language_loss": 0.86910272, "learning_rate": 3.993185903693384e-06, "loss": 0.89252341, "num_input_tokens_seen": 10011975, "step": 462, "time_per_iteration": 3.3313541412353516 }, { "auxiliary_loss_clip": 0.01256365, "auxiliary_loss_mlp": 0.01044911, "balance_loss_clip": 1.07848072, "balance_loss_mlp": 1.03282356, "epoch": 0.05567245836589912, "flos": 23587098410880.0, "grad_norm": 1.9623062631651915, "language_loss": 0.82131141, "learning_rate": 3.9931215051131995e-06, "loss": 0.84432411, "num_input_tokens_seen": 10032620, "step": 463, "time_per_iteration": 2.6627554893493652 }, { "auxiliary_loss_clip": 0.01252951, "auxiliary_loss_mlp": 0.01055579, "balance_loss_clip": 1.07330322, "balance_loss_mlp": 1.04343176, "epoch": 0.05579270125653821, "flos": 27746667129600.0, "grad_norm": 1.7975228642570558, "language_loss": 0.80013227, "learning_rate": 3.993056804178068e-06, "loss": 0.82321751, "num_input_tokens_seen": 10054165, "step": 464, "time_per_iteration": 3.447030544281006 }, { "auxiliary_loss_clip": 0.0121318, "auxiliary_loss_mlp": 0.01045566, "balance_loss_clip": 1.07312107, "balance_loss_mlp": 1.03231001, "epoch": 0.0559129441471773, "flos": 27014161075200.0, "grad_norm": 2.0867259080534852, "language_loss": 0.84481251, "learning_rate": 3.992991800897803e-06, "loss": 0.86739999, "num_input_tokens_seen": 10073970, "step": 465, "time_per_iteration": 3.5320496559143066 }, { "auxiliary_loss_clip": 0.0128294, "auxiliary_loss_mlp": 0.01051739, "balance_loss_clip": 1.08213663, "balance_loss_mlp": 1.03907943, "epoch": 0.05603318703781639, "flos": 15229787794560.0, "grad_norm": 2.153599102477539, "language_loss": 0.90160614, "learning_rate": 3.9929264952822665e-06, "loss": 0.92495286, "num_input_tokens_seen": 10091505, "step": 466, "time_per_iteration": 2.5776610374450684 }, { "auxiliary_loss_clip": 0.01272353, "auxiliary_loss_mlp": 0.01043554, "balance_loss_clip": 1.07913721, "balance_loss_mlp": 1.03077495, "epoch": 0.05615342992845548, "flos": 22266482976000.0, "grad_norm": 1.820271584543583, "language_loss": 0.8828811, "learning_rate": 3.992860887341366e-06, "loss": 0.90604019, "num_input_tokens_seen": 10109675, "step": 467, "time_per_iteration": 2.6077022552490234 }, { "auxiliary_loss_clip": 0.01217005, "auxiliary_loss_mlp": 0.01049196, "balance_loss_clip": 1.07021856, "balance_loss_mlp": 1.03680468, "epoch": 0.056273672819094574, "flos": 23584979508480.0, "grad_norm": 2.306178896194831, "language_loss": 0.81092757, "learning_rate": 3.992794977085052e-06, "loss": 0.83358961, "num_input_tokens_seen": 10127675, "step": 468, "time_per_iteration": 2.6807377338409424 }, { "auxiliary_loss_clip": 0.01231583, "auxiliary_loss_mlp": 0.01053009, "balance_loss_clip": 1.07422876, "balance_loss_mlp": 1.04012871, "epoch": 0.056393915709733664, "flos": 19858708552320.0, "grad_norm": 2.463538662185252, "language_loss": 0.84968269, "learning_rate": 3.992728764523326e-06, "loss": 0.87252855, "num_input_tokens_seen": 10146620, "step": 469, "time_per_iteration": 2.665590524673462 }, { "auxiliary_loss_clip": 0.01252376, "auxiliary_loss_mlp": 0.01049901, "balance_loss_clip": 1.08221054, "balance_loss_mlp": 1.03685331, "epoch": 0.05651415860037275, "flos": 22163779013760.0, "grad_norm": 1.6155816969014418, "language_loss": 0.80784452, "learning_rate": 3.99266224966623e-06, "loss": 0.83086729, "num_input_tokens_seen": 10167535, "step": 470, "time_per_iteration": 2.683539628982544 }, { "auxiliary_loss_clip": 0.01238851, "auxiliary_loss_mlp": 0.01041877, "balance_loss_clip": 1.07476354, "balance_loss_mlp": 1.03061771, "epoch": 0.05663440149101184, "flos": 19463548055040.0, "grad_norm": 2.257288582091297, "language_loss": 0.88032812, "learning_rate": 3.992595432523855e-06, "loss": 0.90313542, "num_input_tokens_seen": 10184825, "step": 471, "time_per_iteration": 2.641402244567871 }, { "auxiliary_loss_clip": 0.01224688, "auxiliary_loss_mlp": 0.01045207, "balance_loss_clip": 1.07480681, "balance_loss_mlp": 1.03353691, "epoch": 0.05675464438165093, "flos": 22670226823680.0, "grad_norm": 1.8079599410593228, "language_loss": 0.85897762, "learning_rate": 3.992528313106338e-06, "loss": 0.88167655, "num_input_tokens_seen": 10203025, "step": 472, "time_per_iteration": 2.6415812969207764 }, { "auxiliary_loss_clip": 0.01286143, "auxiliary_loss_mlp": 0.00765343, "balance_loss_clip": 1.08535147, "balance_loss_mlp": 1.0001328, "epoch": 0.05687488727229002, "flos": 16901177495040.0, "grad_norm": 2.1730951562135177, "language_loss": 0.81838316, "learning_rate": 3.9924608914238595e-06, "loss": 0.83889794, "num_input_tokens_seen": 10218020, "step": 473, "time_per_iteration": 2.565941095352173 }, { "auxiliary_loss_clip": 0.01270613, "auxiliary_loss_mlp": 0.01044465, "balance_loss_clip": 1.08378506, "balance_loss_mlp": 1.03157282, "epoch": 0.05699513016292912, "flos": 29168980945920.0, "grad_norm": 2.493490821766446, "language_loss": 0.83596319, "learning_rate": 3.992393167486648e-06, "loss": 0.85911393, "num_input_tokens_seen": 10237170, "step": 474, "time_per_iteration": 2.664119005203247 }, { "auxiliary_loss_clip": 0.01284622, "auxiliary_loss_mlp": 0.01049945, "balance_loss_clip": 1.08247864, "balance_loss_mlp": 1.03691554, "epoch": 0.05711537305356821, "flos": 18916197632640.0, "grad_norm": 2.253327412820963, "language_loss": 0.80934286, "learning_rate": 3.992325141304977e-06, "loss": 0.83268845, "num_input_tokens_seen": 10255125, "step": 475, "time_per_iteration": 2.580497980117798 }, { "auxiliary_loss_clip": 0.0122116, "auxiliary_loss_mlp": 0.01045124, "balance_loss_clip": 1.0743897, "balance_loss_mlp": 1.03332829, "epoch": 0.0572356159442073, "flos": 26758979879040.0, "grad_norm": 2.1486149398180414, "language_loss": 0.86641967, "learning_rate": 3.992256812889166e-06, "loss": 0.88908249, "num_input_tokens_seen": 10271230, "step": 476, "time_per_iteration": 2.711813449859619 }, { "auxiliary_loss_clip": 0.01285824, "auxiliary_loss_mlp": 0.01044094, "balance_loss_clip": 1.08711183, "balance_loss_mlp": 1.03077853, "epoch": 0.05735585883484639, "flos": 35116146840960.0, "grad_norm": 2.3166712591256178, "language_loss": 0.76771188, "learning_rate": 3.992188182249582e-06, "loss": 0.7910111, "num_input_tokens_seen": 10293125, "step": 477, "time_per_iteration": 2.701622724533081 }, { "auxiliary_loss_clip": 0.01250598, "auxiliary_loss_mlp": 0.01045489, "balance_loss_clip": 1.08040357, "balance_loss_mlp": 1.03230453, "epoch": 0.05747610172548548, "flos": 18734381965440.0, "grad_norm": 1.9997645238857515, "language_loss": 0.90440452, "learning_rate": 3.992119249396633e-06, "loss": 0.92736536, "num_input_tokens_seen": 10311810, "step": 478, "time_per_iteration": 2.627744674682617 }, { "auxiliary_loss_clip": 0.01244504, "auxiliary_loss_mlp": 0.00765191, "balance_loss_clip": 1.07536936, "balance_loss_mlp": 1.00013268, "epoch": 0.05759634461612457, "flos": 27964752554880.0, "grad_norm": 1.7838257933945507, "language_loss": 0.82092017, "learning_rate": 3.992050014340778e-06, "loss": 0.84101713, "num_input_tokens_seen": 10332165, "step": 479, "time_per_iteration": 2.7037198543548584 }, { "auxiliary_loss_clip": 0.01151865, "auxiliary_loss_mlp": 0.01019331, "balance_loss_clip": 1.04844546, "balance_loss_mlp": 1.01556361, "epoch": 0.057716587506763666, "flos": 69292009405440.0, "grad_norm": 1.0389851911997585, "language_loss": 0.55077559, "learning_rate": 3.99198047709252e-06, "loss": 0.57248759, "num_input_tokens_seen": 10393685, "step": 480, "time_per_iteration": 3.2599875926971436 }, { "auxiliary_loss_clip": 0.01225517, "auxiliary_loss_mlp": 0.01044346, "balance_loss_clip": 1.06961679, "balance_loss_mlp": 1.03118563, "epoch": 0.057836830397402755, "flos": 25009196745600.0, "grad_norm": 2.0398306006536586, "language_loss": 0.78891838, "learning_rate": 3.991910637662408e-06, "loss": 0.81161702, "num_input_tokens_seen": 10413975, "step": 481, "time_per_iteration": 2.6791133880615234 }, { "auxiliary_loss_clip": 0.01277702, "auxiliary_loss_mlp": 0.0104815, "balance_loss_clip": 1.08081758, "balance_loss_mlp": 1.03579962, "epoch": 0.057957073288041845, "flos": 25593894334080.0, "grad_norm": 2.9022775182528546, "language_loss": 0.80757415, "learning_rate": 3.9918404960610355e-06, "loss": 0.83083266, "num_input_tokens_seen": 10433005, "step": 482, "time_per_iteration": 2.589202404022217 }, { "auxiliary_loss_clip": 0.01271507, "auxiliary_loss_mlp": 0.01064802, "balance_loss_clip": 1.0812248, "balance_loss_mlp": 1.0522548, "epoch": 0.058077316178680935, "flos": 20777411733120.0, "grad_norm": 2.3474990242863836, "language_loss": 0.77201498, "learning_rate": 3.991770052299043e-06, "loss": 0.79537809, "num_input_tokens_seen": 10451235, "step": 483, "time_per_iteration": 2.6040334701538086 }, { "auxiliary_loss_clip": 0.012504, "auxiliary_loss_mlp": 0.01044498, "balance_loss_clip": 1.07479393, "balance_loss_mlp": 1.03234434, "epoch": 0.058197559069320025, "flos": 18916484941440.0, "grad_norm": 2.366093495335668, "language_loss": 0.87785608, "learning_rate": 3.991699306387118e-06, "loss": 0.900805, "num_input_tokens_seen": 10469705, "step": 484, "time_per_iteration": 2.639477491378784 }, { "auxiliary_loss_clip": 0.01264085, "auxiliary_loss_mlp": 0.01056878, "balance_loss_clip": 1.07866752, "balance_loss_mlp": 1.04350877, "epoch": 0.058317801959959115, "flos": 24863327614080.0, "grad_norm": 1.8776906326022489, "language_loss": 0.78029037, "learning_rate": 3.991628258335991e-06, "loss": 0.80349994, "num_input_tokens_seen": 10491910, "step": 485, "time_per_iteration": 2.771372079849243 }, { "auxiliary_loss_clip": 0.01222634, "auxiliary_loss_mlp": 0.01048229, "balance_loss_clip": 1.07086694, "balance_loss_mlp": 1.03460288, "epoch": 0.05843804485059821, "flos": 23257977068160.0, "grad_norm": 3.196519336640929, "language_loss": 0.87219238, "learning_rate": 3.991556908156442e-06, "loss": 0.89490104, "num_input_tokens_seen": 10508435, "step": 486, "time_per_iteration": 2.753873586654663 }, { "auxiliary_loss_clip": 0.0125008, "auxiliary_loss_mlp": 0.01049136, "balance_loss_clip": 1.07383764, "balance_loss_mlp": 1.03635049, "epoch": 0.0585582877412373, "flos": 23150532510720.0, "grad_norm": 1.8187273938789579, "language_loss": 0.87655455, "learning_rate": 3.9914852558592914e-06, "loss": 0.89954674, "num_input_tokens_seen": 10529485, "step": 487, "time_per_iteration": 3.490030288696289 }, { "auxiliary_loss_clip": 0.01262742, "auxiliary_loss_mlp": 0.01042054, "balance_loss_clip": 1.07959008, "balance_loss_mlp": 1.0294838, "epoch": 0.05867853063187639, "flos": 23506406507520.0, "grad_norm": 3.0969166512267554, "language_loss": 0.80824935, "learning_rate": 3.991413301455413e-06, "loss": 0.83129728, "num_input_tokens_seen": 10545935, "step": 488, "time_per_iteration": 3.335170030593872 }, { "auxiliary_loss_clip": 0.01235497, "auxiliary_loss_mlp": 0.01054452, "balance_loss_clip": 1.07361495, "balance_loss_mlp": 1.04227471, "epoch": 0.05879877352251548, "flos": 29495803818240.0, "grad_norm": 2.240678389649679, "language_loss": 0.77789956, "learning_rate": 3.991341044955719e-06, "loss": 0.80079901, "num_input_tokens_seen": 10565690, "step": 489, "time_per_iteration": 3.4928698539733887 }, { "auxiliary_loss_clip": 0.0126155, "auxiliary_loss_mlp": 0.00765787, "balance_loss_clip": 1.07562983, "balance_loss_mlp": 1.00016189, "epoch": 0.05891901641315457, "flos": 20157485880960.0, "grad_norm": 1.9682611208958198, "language_loss": 0.81622517, "learning_rate": 3.991268486371172e-06, "loss": 0.8364985, "num_input_tokens_seen": 10584245, "step": 490, "time_per_iteration": 2.8060574531555176 }, { "auxiliary_loss_clip": 0.01244284, "auxiliary_loss_mlp": 0.01059061, "balance_loss_clip": 1.07197094, "balance_loss_mlp": 1.0460794, "epoch": 0.05903925930379366, "flos": 24644200694400.0, "grad_norm": 2.141381900601277, "language_loss": 0.88035762, "learning_rate": 3.991195625712779e-06, "loss": 0.90339112, "num_input_tokens_seen": 10601210, "step": 491, "time_per_iteration": 3.451523780822754 }, { "auxiliary_loss_clip": 0.01279524, "auxiliary_loss_mlp": 0.01037099, "balance_loss_clip": 1.08127785, "balance_loss_mlp": 1.02448702, "epoch": 0.05915950219443276, "flos": 21250391045760.0, "grad_norm": 2.4487033730797525, "language_loss": 0.81763554, "learning_rate": 3.991122462991592e-06, "loss": 0.84080178, "num_input_tokens_seen": 10620730, "step": 492, "time_per_iteration": 2.55501127243042 }, { "auxiliary_loss_clip": 0.01279631, "auxiliary_loss_mlp": 0.01053818, "balance_loss_clip": 1.07846665, "balance_loss_mlp": 1.03994179, "epoch": 0.05927974508507185, "flos": 9902727319680.0, "grad_norm": 4.224674981410079, "language_loss": 0.81611991, "learning_rate": 3.991048998218712e-06, "loss": 0.83945441, "num_input_tokens_seen": 10634035, "step": 493, "time_per_iteration": 2.5507798194885254 }, { "auxiliary_loss_clip": 0.01262976, "auxiliary_loss_mlp": 0.01042799, "balance_loss_clip": 1.07591891, "balance_loss_mlp": 1.02976382, "epoch": 0.05939998797571094, "flos": 18259499232000.0, "grad_norm": 2.270025212133049, "language_loss": 0.7691642, "learning_rate": 3.990975231405281e-06, "loss": 0.7922219, "num_input_tokens_seen": 10652485, "step": 494, "time_per_iteration": 2.825798749923706 }, { "auxiliary_loss_clip": 0.01262594, "auxiliary_loss_mlp": 0.01057792, "balance_loss_clip": 1.07764196, "balance_loss_mlp": 1.04494143, "epoch": 0.05952023086635003, "flos": 28256598558720.0, "grad_norm": 1.8822811824409889, "language_loss": 0.78998637, "learning_rate": 3.990901162562491e-06, "loss": 0.81319028, "num_input_tokens_seen": 10673175, "step": 495, "time_per_iteration": 2.8355653285980225 }, { "auxiliary_loss_clip": 0.01227636, "auxiliary_loss_mlp": 0.00765813, "balance_loss_clip": 1.06904292, "balance_loss_mlp": 1.00011921, "epoch": 0.05964047375698912, "flos": 14902498045440.0, "grad_norm": 1.9067899420400232, "language_loss": 0.90687156, "learning_rate": 3.9908267917015765e-06, "loss": 0.92680609, "num_input_tokens_seen": 10691235, "step": 496, "time_per_iteration": 2.834641695022583 }, { "auxiliary_loss_clip": 0.01254186, "auxiliary_loss_mlp": 0.01049893, "balance_loss_clip": 1.07632351, "balance_loss_mlp": 1.03648162, "epoch": 0.059760716647628206, "flos": 23185581206400.0, "grad_norm": 1.9529658582542784, "language_loss": 0.93222392, "learning_rate": 3.990752118833821e-06, "loss": 0.95526475, "num_input_tokens_seen": 10708675, "step": 497, "time_per_iteration": 2.652101755142212 }, { "auxiliary_loss_clip": 0.01278935, "auxiliary_loss_mlp": 0.01047184, "balance_loss_clip": 1.08081293, "balance_loss_mlp": 1.03498316, "epoch": 0.0598809595382673, "flos": 22746968231040.0, "grad_norm": 1.7711190029344144, "language_loss": 0.77715874, "learning_rate": 3.990677143970553e-06, "loss": 0.80041993, "num_input_tokens_seen": 10729485, "step": 498, "time_per_iteration": 2.663672924041748 }, { "auxiliary_loss_clip": 0.01229819, "auxiliary_loss_mlp": 0.01049363, "balance_loss_clip": 1.07697511, "balance_loss_mlp": 1.0370245, "epoch": 0.06000120242890639, "flos": 22127221946880.0, "grad_norm": 2.364353016712349, "language_loss": 0.81232512, "learning_rate": 3.990601867123144e-06, "loss": 0.83511698, "num_input_tokens_seen": 10749210, "step": 499, "time_per_iteration": 2.663703680038452 }, { "auxiliary_loss_clip": 0.01211839, "auxiliary_loss_mlp": 0.01053164, "balance_loss_clip": 1.06825471, "balance_loss_mlp": 1.0413506, "epoch": 0.06012144531954548, "flos": 19171773878400.0, "grad_norm": 2.931374988443663, "language_loss": 0.85082281, "learning_rate": 3.990526288303014e-06, "loss": 0.87347287, "num_input_tokens_seen": 10768000, "step": 500, "time_per_iteration": 2.7065019607543945 }, { "auxiliary_loss_clip": 0.01246428, "auxiliary_loss_mlp": 0.00764705, "balance_loss_clip": 1.07767248, "balance_loss_mlp": 1.0001061, "epoch": 0.06024168821018457, "flos": 22783345729920.0, "grad_norm": 2.555422856451407, "language_loss": 0.90650702, "learning_rate": 3.9904504075216295e-06, "loss": 0.92661834, "num_input_tokens_seen": 10788760, "step": 501, "time_per_iteration": 2.6695640087127686 }, { "auxiliary_loss_clip": 0.01224168, "auxiliary_loss_mlp": 0.01047045, "balance_loss_clip": 1.06664872, "balance_loss_mlp": 1.03402758, "epoch": 0.06036193110082366, "flos": 18770687637120.0, "grad_norm": 2.3387759037397773, "language_loss": 0.93729424, "learning_rate": 3.990374224790501e-06, "loss": 0.9600063, "num_input_tokens_seen": 10806965, "step": 502, "time_per_iteration": 2.6540727615356445 }, { "auxiliary_loss_clip": 0.01248186, "auxiliary_loss_mlp": 0.01044985, "balance_loss_clip": 1.07831144, "balance_loss_mlp": 1.03227139, "epoch": 0.06048217399146275, "flos": 17201570935680.0, "grad_norm": 2.3223539321526174, "language_loss": 0.70795619, "learning_rate": 3.990297740121185e-06, "loss": 0.73088789, "num_input_tokens_seen": 10824900, "step": 503, "time_per_iteration": 2.6196703910827637 }, { "auxiliary_loss_clip": 0.01264024, "auxiliary_loss_mlp": 0.00765414, "balance_loss_clip": 1.07805657, "balance_loss_mlp": 1.00009596, "epoch": 0.06060241688210185, "flos": 24024131187840.0, "grad_norm": 1.909524278749976, "language_loss": 0.77913809, "learning_rate": 3.990220953525284e-06, "loss": 0.7994324, "num_input_tokens_seen": 10842010, "step": 504, "time_per_iteration": 2.6030960083007812 }, { "auxiliary_loss_clip": 0.01234511, "auxiliary_loss_mlp": 0.010468, "balance_loss_clip": 1.07193315, "balance_loss_mlp": 1.03350866, "epoch": 0.06072265977274094, "flos": 14611190745600.0, "grad_norm": 2.4236272251043163, "language_loss": 0.74305475, "learning_rate": 3.9901438650144465e-06, "loss": 0.76586783, "num_input_tokens_seen": 10858260, "step": 505, "time_per_iteration": 2.602219581604004 }, { "auxiliary_loss_clip": 0.01255432, "auxiliary_loss_mlp": 0.0104689, "balance_loss_clip": 1.0771842, "balance_loss_mlp": 1.03490341, "epoch": 0.06084290266338003, "flos": 20558284813440.0, "grad_norm": 3.0150272096919, "language_loss": 0.92135012, "learning_rate": 3.990066474600367e-06, "loss": 0.94437337, "num_input_tokens_seen": 10876230, "step": 506, "time_per_iteration": 2.5963540077209473 }, { "auxiliary_loss_clip": 0.01250544, "auxiliary_loss_mlp": 0.01045916, "balance_loss_clip": 1.07336879, "balance_loss_mlp": 1.0338347, "epoch": 0.06096314555401912, "flos": 22309217182080.0, "grad_norm": 1.9702131881666787, "language_loss": 0.68245375, "learning_rate": 3.989988782294786e-06, "loss": 0.70541835, "num_input_tokens_seen": 10896320, "step": 507, "time_per_iteration": 2.5814578533172607 }, { "auxiliary_loss_clip": 0.01213388, "auxiliary_loss_mlp": 0.01044135, "balance_loss_clip": 1.06772304, "balance_loss_mlp": 1.03258967, "epoch": 0.06108338844465821, "flos": 19131374056320.0, "grad_norm": 1.6843598113591054, "language_loss": 0.95091623, "learning_rate": 3.989910788109489e-06, "loss": 0.97349143, "num_input_tokens_seen": 10912970, "step": 508, "time_per_iteration": 2.646916627883911 }, { "auxiliary_loss_clip": 0.01223241, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.07105041, "balance_loss_mlp": 1.03419542, "epoch": 0.0612036313352973, "flos": 33584018169600.0, "grad_norm": 2.324021716955585, "language_loss": 0.74746776, "learning_rate": 3.989832492056307e-06, "loss": 0.77015817, "num_input_tokens_seen": 10933995, "step": 509, "time_per_iteration": 2.7751195430755615 }, { "auxiliary_loss_clip": 0.0126113, "auxiliary_loss_mlp": 0.01065266, "balance_loss_clip": 1.07912171, "balance_loss_mlp": 1.05252266, "epoch": 0.06132387422593639, "flos": 27490552179840.0, "grad_norm": 2.11969783564211, "language_loss": 0.8090592, "learning_rate": 3.989753894147119e-06, "loss": 0.83232313, "num_input_tokens_seen": 10954120, "step": 510, "time_per_iteration": 2.631540536880493 }, { "auxiliary_loss_clip": 0.01259764, "auxiliary_loss_mlp": 0.01048297, "balance_loss_clip": 1.0827086, "balance_loss_mlp": 1.03825998, "epoch": 0.061444117116575485, "flos": 25885057979520.0, "grad_norm": 2.127522715318158, "language_loss": 0.80073464, "learning_rate": 3.989674994393846e-06, "loss": 0.82381523, "num_input_tokens_seen": 10973595, "step": 511, "time_per_iteration": 2.633462905883789 }, { "auxiliary_loss_clip": 0.01259983, "auxiliary_loss_mlp": 0.0104612, "balance_loss_clip": 1.07956398, "balance_loss_mlp": 1.0342406, "epoch": 0.061564360007214575, "flos": 28512031150080.0, "grad_norm": 2.0540382646430686, "language_loss": 0.93818754, "learning_rate": 3.98959579280846e-06, "loss": 0.96124864, "num_input_tokens_seen": 10991995, "step": 512, "time_per_iteration": 2.6894302368164062 }, { "auxiliary_loss_clip": 0.01190529, "auxiliary_loss_mlp": 0.01057076, "balance_loss_clip": 1.06907701, "balance_loss_mlp": 1.0452323, "epoch": 0.061684602897853665, "flos": 12094355652480.0, "grad_norm": 2.043367070532318, "language_loss": 0.82960051, "learning_rate": 3.989516289402973e-06, "loss": 0.85207653, "num_input_tokens_seen": 11007625, "step": 513, "time_per_iteration": 3.5100958347320557 }, { "auxiliary_loss_clip": 0.01171159, "auxiliary_loss_mlp": 0.01043948, "balance_loss_clip": 1.059955, "balance_loss_mlp": 1.03236687, "epoch": 0.061804845788492754, "flos": 19532639865600.0, "grad_norm": 2.3344179504444633, "language_loss": 0.80111456, "learning_rate": 3.989436484189447e-06, "loss": 0.82326567, "num_input_tokens_seen": 11025570, "step": 514, "time_per_iteration": 3.447075843811035 }, { "auxiliary_loss_clip": 0.01260435, "auxiliary_loss_mlp": 0.01047756, "balance_loss_clip": 1.074422, "balance_loss_mlp": 1.0357275, "epoch": 0.061925088679131844, "flos": 15341111020800.0, "grad_norm": 2.6921143612041765, "language_loss": 0.8103888, "learning_rate": 3.9893563771799885e-06, "loss": 0.8334707, "num_input_tokens_seen": 11042045, "step": 515, "time_per_iteration": 3.3282310962677 }, { "auxiliary_loss_clip": 0.0127667, "auxiliary_loss_mlp": 0.01050917, "balance_loss_clip": 1.07835281, "balance_loss_mlp": 1.03896046, "epoch": 0.062045331569770934, "flos": 25919927107200.0, "grad_norm": 2.189404071044197, "language_loss": 0.86477757, "learning_rate": 3.989275968386749e-06, "loss": 0.88805342, "num_input_tokens_seen": 11059955, "step": 516, "time_per_iteration": 2.606621742248535 }, { "auxiliary_loss_clip": 0.01235541, "auxiliary_loss_mlp": 0.01047229, "balance_loss_clip": 1.06994545, "balance_loss_mlp": 1.03583241, "epoch": 0.06216557446041003, "flos": 28110621686400.0, "grad_norm": 1.9546431551730092, "language_loss": 0.76878774, "learning_rate": 3.989195257821926e-06, "loss": 0.79161537, "num_input_tokens_seen": 11078440, "step": 517, "time_per_iteration": 3.4888803958892822 }, { "auxiliary_loss_clip": 0.0124169, "auxiliary_loss_mlp": 0.0105274, "balance_loss_clip": 1.07802033, "balance_loss_mlp": 1.04061019, "epoch": 0.06228581735104912, "flos": 23478181395840.0, "grad_norm": 2.026318371043068, "language_loss": 0.84411228, "learning_rate": 3.989114245497765e-06, "loss": 0.86705661, "num_input_tokens_seen": 11098240, "step": 518, "time_per_iteration": 2.64931583404541 }, { "auxiliary_loss_clip": 0.01261106, "auxiliary_loss_mlp": 0.01043902, "balance_loss_clip": 1.07438207, "balance_loss_mlp": 1.03343558, "epoch": 0.06240606024168821, "flos": 15195205975680.0, "grad_norm": 2.125786540813677, "language_loss": 0.95091236, "learning_rate": 3.989032931426554e-06, "loss": 0.97396243, "num_input_tokens_seen": 11115395, "step": 519, "time_per_iteration": 2.5838935375213623 }, { "auxiliary_loss_clip": 0.01244845, "auxiliary_loss_mlp": 0.01043473, "balance_loss_clip": 1.07580841, "balance_loss_mlp": 1.03225553, "epoch": 0.06252630313232731, "flos": 20631829910400.0, "grad_norm": 2.42934819584401, "language_loss": 0.86949825, "learning_rate": 3.9889513156206295e-06, "loss": 0.89238149, "num_input_tokens_seen": 11134835, "step": 520, "time_per_iteration": 2.652130365371704 }, { "auxiliary_loss_clip": 0.01231963, "auxiliary_loss_mlp": 0.01058614, "balance_loss_clip": 1.07343757, "balance_loss_mlp": 1.04541779, "epoch": 0.06264654602296639, "flos": 20778058177920.0, "grad_norm": 2.8371954327795255, "language_loss": 0.73773623, "learning_rate": 3.988869398092371e-06, "loss": 0.76064205, "num_input_tokens_seen": 11154745, "step": 521, "time_per_iteration": 2.6562561988830566 }, { "auxiliary_loss_clip": 0.01240809, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.07411027, "balance_loss_mlp": 1.03139758, "epoch": 0.06276678891360549, "flos": 29605798241280.0, "grad_norm": 2.628429363213342, "language_loss": 0.78717422, "learning_rate": 3.988787178854206e-06, "loss": 0.81002134, "num_input_tokens_seen": 11174280, "step": 522, "time_per_iteration": 2.68205189704895 }, { "auxiliary_loss_clip": 0.01278509, "auxiliary_loss_mlp": 0.01048228, "balance_loss_clip": 1.08134794, "balance_loss_mlp": 1.03672409, "epoch": 0.06288703180424457, "flos": 22126288193280.0, "grad_norm": 2.2042800810250136, "language_loss": 0.87602359, "learning_rate": 3.988704657918608e-06, "loss": 0.89929098, "num_input_tokens_seen": 11193340, "step": 523, "time_per_iteration": 2.616750717163086 }, { "auxiliary_loss_clip": 0.01264149, "auxiliary_loss_mlp": 0.0104335, "balance_loss_clip": 1.08097339, "balance_loss_mlp": 1.03157771, "epoch": 0.06300727469488367, "flos": 14976689587200.0, "grad_norm": 3.0882908112088288, "language_loss": 0.79932868, "learning_rate": 3.988621835298094e-06, "loss": 0.82240367, "num_input_tokens_seen": 11210555, "step": 524, "time_per_iteration": 2.6330788135528564 }, { "auxiliary_loss_clip": 0.01273342, "auxiliary_loss_mlp": 0.01051038, "balance_loss_clip": 1.08046305, "balance_loss_mlp": 1.03970098, "epoch": 0.06312751758552275, "flos": 24535391420160.0, "grad_norm": 2.162556529675408, "language_loss": 0.9157061, "learning_rate": 3.988538711005229e-06, "loss": 0.93894994, "num_input_tokens_seen": 11230010, "step": 525, "time_per_iteration": 2.6286098957061768 }, { "auxiliary_loss_clip": 0.01251471, "auxiliary_loss_mlp": 0.01047489, "balance_loss_clip": 1.07588553, "balance_loss_mlp": 1.03624809, "epoch": 0.06324776047616185, "flos": 21507008785920.0, "grad_norm": 2.087883055641979, "language_loss": 0.88036543, "learning_rate": 3.988455285052622e-06, "loss": 0.903355, "num_input_tokens_seen": 11246190, "step": 526, "time_per_iteration": 2.630317449569702 }, { "auxiliary_loss_clip": 0.01256617, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.07741177, "balance_loss_mlp": 1.02954292, "epoch": 0.06336800336680094, "flos": 21688034353920.0, "grad_norm": 2.169447297624444, "language_loss": 0.8366589, "learning_rate": 3.98837155745293e-06, "loss": 0.85964519, "num_input_tokens_seen": 11264230, "step": 527, "time_per_iteration": 2.5888521671295166 }, { "auxiliary_loss_clip": 0.01258339, "auxiliary_loss_mlp": 0.0104221, "balance_loss_clip": 1.07819629, "balance_loss_mlp": 1.03028297, "epoch": 0.06348824625744003, "flos": 19500895221120.0, "grad_norm": 2.5797051560958226, "language_loss": 0.75784987, "learning_rate": 3.988287528218854e-06, "loss": 0.7808553, "num_input_tokens_seen": 11283015, "step": 528, "time_per_iteration": 2.609233856201172 }, { "auxiliary_loss_clip": 0.0125655, "auxiliary_loss_mlp": 0.01054859, "balance_loss_clip": 1.07893288, "balance_loss_mlp": 1.0440284, "epoch": 0.06360848914807912, "flos": 15481233976320.0, "grad_norm": 2.1687781662981367, "language_loss": 0.90363562, "learning_rate": 3.98820319736314e-06, "loss": 0.92674971, "num_input_tokens_seen": 11299630, "step": 529, "time_per_iteration": 2.5724587440490723 }, { "auxiliary_loss_clip": 0.01229638, "auxiliary_loss_mlp": 0.01041206, "balance_loss_clip": 1.07270598, "balance_loss_mlp": 1.02949929, "epoch": 0.0637287320387182, "flos": 20593369422720.0, "grad_norm": 1.907999202349445, "language_loss": 0.85219598, "learning_rate": 3.988118564898582e-06, "loss": 0.87490445, "num_input_tokens_seen": 11319170, "step": 530, "time_per_iteration": 2.691847085952759 }, { "auxiliary_loss_clip": 0.01219407, "auxiliary_loss_mlp": 0.00764728, "balance_loss_clip": 1.07325852, "balance_loss_mlp": 1.00017428, "epoch": 0.0638489749293573, "flos": 17412222245760.0, "grad_norm": 2.200254877225638, "language_loss": 0.89075184, "learning_rate": 3.988033630838019e-06, "loss": 0.91059315, "num_input_tokens_seen": 11333210, "step": 531, "time_per_iteration": 2.6956379413604736 }, { "auxiliary_loss_clip": 0.01261038, "auxiliary_loss_mlp": 0.01057138, "balance_loss_clip": 1.07915831, "balance_loss_mlp": 1.04488337, "epoch": 0.0639692178199964, "flos": 23807661874560.0, "grad_norm": 1.9362046985089154, "language_loss": 0.88025516, "learning_rate": 3.987948395194334e-06, "loss": 0.9034369, "num_input_tokens_seen": 11355590, "step": 532, "time_per_iteration": 2.668919563293457 }, { "auxiliary_loss_clip": 0.01253883, "auxiliary_loss_mlp": 0.01046992, "balance_loss_clip": 1.07495499, "balance_loss_mlp": 1.03581595, "epoch": 0.06408946071063548, "flos": 18477225521280.0, "grad_norm": 2.0093428933547646, "language_loss": 0.76737905, "learning_rate": 3.987862857980458e-06, "loss": 0.79038787, "num_input_tokens_seen": 11371535, "step": 533, "time_per_iteration": 2.6454381942749023 }, { "auxiliary_loss_clip": 0.01231488, "auxiliary_loss_mlp": 0.01041555, "balance_loss_clip": 1.07368457, "balance_loss_mlp": 1.02990234, "epoch": 0.06420970360127458, "flos": 27162220936320.0, "grad_norm": 2.5432495496211383, "language_loss": 0.76837718, "learning_rate": 3.987777019209368e-06, "loss": 0.79110765, "num_input_tokens_seen": 11392050, "step": 534, "time_per_iteration": 2.7288451194763184 }, { "auxiliary_loss_clip": 0.01272522, "auxiliary_loss_mlp": 0.01059962, "balance_loss_clip": 1.07847691, "balance_loss_mlp": 1.04931068, "epoch": 0.06432994649191366, "flos": 23659673840640.0, "grad_norm": 1.7103509691446848, "language_loss": 0.80824757, "learning_rate": 3.987690878894084e-06, "loss": 0.83157241, "num_input_tokens_seen": 11411765, "step": 535, "time_per_iteration": 2.615147113800049 }, { "auxiliary_loss_clip": 0.01244894, "auxiliary_loss_mlp": 0.01044388, "balance_loss_clip": 1.07421041, "balance_loss_mlp": 1.03303981, "epoch": 0.06445018938255276, "flos": 23403953940480.0, "grad_norm": 2.247405817676521, "language_loss": 0.84957033, "learning_rate": 3.987604437047673e-06, "loss": 0.87246317, "num_input_tokens_seen": 11431565, "step": 536, "time_per_iteration": 2.6302084922790527 }, { "auxiliary_loss_clip": 0.01252186, "auxiliary_loss_mlp": 0.01056235, "balance_loss_clip": 1.0735687, "balance_loss_mlp": 1.04409933, "epoch": 0.06457043227319184, "flos": 19646692525440.0, "grad_norm": 2.028803833334278, "language_loss": 0.77318633, "learning_rate": 3.987517693683251e-06, "loss": 0.79627049, "num_input_tokens_seen": 11450140, "step": 537, "time_per_iteration": 2.6171834468841553 }, { "auxiliary_loss_clip": 0.01235869, "auxiliary_loss_mlp": 0.01055873, "balance_loss_clip": 1.07624435, "balance_loss_mlp": 1.04348755, "epoch": 0.06469067516383094, "flos": 16978744915200.0, "grad_norm": 2.5694532576729787, "language_loss": 0.95938122, "learning_rate": 3.9874306488139745e-06, "loss": 0.98229861, "num_input_tokens_seen": 11465400, "step": 538, "time_per_iteration": 3.455869197845459 }, { "auxiliary_loss_clip": 0.01223202, "auxiliary_loss_mlp": 0.0104354, "balance_loss_clip": 1.07426751, "balance_loss_mlp": 1.03230453, "epoch": 0.06481091805447003, "flos": 23296401642240.0, "grad_norm": 1.9402330626601905, "language_loss": 0.87993586, "learning_rate": 3.987343302453049e-06, "loss": 0.90260327, "num_input_tokens_seen": 11486675, "step": 539, "time_per_iteration": 2.7129294872283936 }, { "auxiliary_loss_clip": 0.01235147, "auxiliary_loss_mlp": 0.01043343, "balance_loss_clip": 1.07344604, "balance_loss_mlp": 1.03191686, "epoch": 0.06493116094510912, "flos": 29172356824320.0, "grad_norm": 1.6105975875237397, "language_loss": 0.82369244, "learning_rate": 3.987255654613724e-06, "loss": 0.84647739, "num_input_tokens_seen": 11510440, "step": 540, "time_per_iteration": 3.4826035499572754 }, { "auxiliary_loss_clip": 0.01219884, "auxiliary_loss_mlp": 0.01047173, "balance_loss_clip": 1.06847107, "balance_loss_mlp": 1.03591418, "epoch": 0.06505140383574821, "flos": 19865065259520.0, "grad_norm": 3.0762599994689683, "language_loss": 0.70741105, "learning_rate": 3.987167705309296e-06, "loss": 0.73008168, "num_input_tokens_seen": 11529715, "step": 541, "time_per_iteration": 2.676022529602051 }, { "auxiliary_loss_clip": 0.01252561, "auxiliary_loss_mlp": 0.00763634, "balance_loss_clip": 1.07284391, "balance_loss_mlp": 1.0001626, "epoch": 0.0651716467263873, "flos": 17924703540480.0, "grad_norm": 2.113963994377332, "language_loss": 0.95046639, "learning_rate": 3.987079454553108e-06, "loss": 0.97062832, "num_input_tokens_seen": 11547665, "step": 542, "time_per_iteration": 3.4150912761688232 }, { "auxiliary_loss_clip": 0.0122367, "auxiliary_loss_mlp": 0.01051054, "balance_loss_clip": 1.07699931, "balance_loss_mlp": 1.03880548, "epoch": 0.0652918896170264, "flos": 20842840356480.0, "grad_norm": 1.8116096917957822, "language_loss": 0.91254866, "learning_rate": 3.986990902358546e-06, "loss": 0.93529588, "num_input_tokens_seen": 11564605, "step": 543, "time_per_iteration": 2.6770262718200684 }, { "auxiliary_loss_clip": 0.0125858, "auxiliary_loss_mlp": 0.01047128, "balance_loss_clip": 1.07661843, "balance_loss_mlp": 1.03601193, "epoch": 0.06541213250766549, "flos": 21872507627520.0, "grad_norm": 1.992415342854652, "language_loss": 0.9335835, "learning_rate": 3.986902048739045e-06, "loss": 0.9566406, "num_input_tokens_seen": 11584550, "step": 544, "time_per_iteration": 3.4635348320007324 }, { "auxiliary_loss_clip": 0.01236475, "auxiliary_loss_mlp": 0.01046858, "balance_loss_clip": 1.07139945, "balance_loss_mlp": 1.03407884, "epoch": 0.06553237539830457, "flos": 23110743219840.0, "grad_norm": 2.7779404446210365, "language_loss": 0.79984528, "learning_rate": 3.986812893708082e-06, "loss": 0.82267857, "num_input_tokens_seen": 11600740, "step": 545, "time_per_iteration": 2.614450454711914 }, { "auxiliary_loss_clip": 0.01241546, "auxiliary_loss_mlp": 0.01040111, "balance_loss_clip": 1.07030034, "balance_loss_mlp": 1.0275228, "epoch": 0.06565261828894367, "flos": 17923769786880.0, "grad_norm": 5.6602268639259625, "language_loss": 0.81279171, "learning_rate": 3.9867234372791826e-06, "loss": 0.83560836, "num_input_tokens_seen": 11618695, "step": 546, "time_per_iteration": 2.6608729362487793 }, { "auxiliary_loss_clip": 0.01251632, "auxiliary_loss_mlp": 0.0104503, "balance_loss_clip": 1.07362247, "balance_loss_mlp": 1.03331137, "epoch": 0.06577286117958275, "flos": 22783058421120.0, "grad_norm": 1.5139172224092556, "language_loss": 0.87150651, "learning_rate": 3.986633679465918e-06, "loss": 0.89447308, "num_input_tokens_seen": 11638850, "step": 547, "time_per_iteration": 2.596078872680664 }, { "auxiliary_loss_clip": 0.01213132, "auxiliary_loss_mlp": 0.01048154, "balance_loss_clip": 1.07187581, "balance_loss_mlp": 1.03727019, "epoch": 0.06589310407022185, "flos": 23696194993920.0, "grad_norm": 2.4132777934797045, "language_loss": 0.80538988, "learning_rate": 3.986543620281904e-06, "loss": 0.82800269, "num_input_tokens_seen": 11658500, "step": 548, "time_per_iteration": 2.6817259788513184 }, { "auxiliary_loss_clip": 0.01217636, "auxiliary_loss_mlp": 0.01043277, "balance_loss_clip": 1.06776524, "balance_loss_mlp": 1.03266716, "epoch": 0.06601334696086093, "flos": 26864772410880.0, "grad_norm": 1.6736101607819514, "language_loss": 0.91200387, "learning_rate": 3.986453259740802e-06, "loss": 0.93461299, "num_input_tokens_seen": 11676670, "step": 549, "time_per_iteration": 2.6795802116394043 }, { "auxiliary_loss_clip": 0.01242286, "auxiliary_loss_mlp": 0.0105535, "balance_loss_clip": 1.07764482, "balance_loss_mlp": 1.04350662, "epoch": 0.06613358985150003, "flos": 12567694101120.0, "grad_norm": 3.2093966990138516, "language_loss": 0.78900301, "learning_rate": 3.986362597856319e-06, "loss": 0.81197941, "num_input_tokens_seen": 11693170, "step": 550, "time_per_iteration": 2.6329336166381836 }, { "auxiliary_loss_clip": 0.0123304, "auxiliary_loss_mlp": 0.00765553, "balance_loss_clip": 1.0702951, "balance_loss_mlp": 1.00016415, "epoch": 0.06625383274213913, "flos": 18332505624960.0, "grad_norm": 2.6527589830274714, "language_loss": 0.82140541, "learning_rate": 3.986271634642211e-06, "loss": 0.84139132, "num_input_tokens_seen": 11710150, "step": 551, "time_per_iteration": 2.6207046508789062 }, { "auxiliary_loss_clip": 0.01271643, "auxiliary_loss_mlp": 0.01045794, "balance_loss_clip": 1.07981014, "balance_loss_mlp": 1.03420675, "epoch": 0.06637407563277821, "flos": 15375585098880.0, "grad_norm": 2.1735218353436054, "language_loss": 0.81802225, "learning_rate": 3.986180370112274e-06, "loss": 0.84119666, "num_input_tokens_seen": 11726670, "step": 552, "time_per_iteration": 2.536914587020874 }, { "auxiliary_loss_clip": 0.01255075, "auxiliary_loss_mlp": 0.00765468, "balance_loss_clip": 1.07515502, "balance_loss_mlp": 1.00020313, "epoch": 0.0664943185234173, "flos": 24025244509440.0, "grad_norm": 1.7439477321207166, "language_loss": 0.74575949, "learning_rate": 3.986088804280354e-06, "loss": 0.76596498, "num_input_tokens_seen": 11746400, "step": 553, "time_per_iteration": 2.631284475326538 }, { "auxiliary_loss_clip": 0.01238389, "auxiliary_loss_mlp": 0.01042547, "balance_loss_clip": 1.07316065, "balance_loss_mlp": 1.03142476, "epoch": 0.06661456141405639, "flos": 20957503547520.0, "grad_norm": 2.6278381746747557, "language_loss": 0.94006652, "learning_rate": 3.985996937160342e-06, "loss": 0.96287584, "num_input_tokens_seen": 11765590, "step": 554, "time_per_iteration": 2.636082172393799 }, { "auxiliary_loss_clip": 0.01254282, "auxiliary_loss_mlp": 0.01043477, "balance_loss_clip": 1.08012795, "balance_loss_mlp": 1.03277183, "epoch": 0.06673480430469549, "flos": 52223953322880.0, "grad_norm": 1.9934659938001698, "language_loss": 0.68913209, "learning_rate": 3.985904768766173e-06, "loss": 0.71210963, "num_input_tokens_seen": 11788365, "step": 555, "time_per_iteration": 2.831266403198242 }, { "auxiliary_loss_clip": 0.01225772, "auxiliary_loss_mlp": 0.01053713, "balance_loss_clip": 1.0713582, "balance_loss_mlp": 1.04082084, "epoch": 0.06685504719533458, "flos": 16217079995520.0, "grad_norm": 2.296471053138094, "language_loss": 0.76338327, "learning_rate": 3.98581229911183e-06, "loss": 0.78617817, "num_input_tokens_seen": 11807285, "step": 556, "time_per_iteration": 2.6244442462921143 }, { "auxiliary_loss_clip": 0.0124989, "auxiliary_loss_mlp": 0.010459, "balance_loss_clip": 1.07046759, "balance_loss_mlp": 1.03424096, "epoch": 0.06697529008597367, "flos": 22491535639680.0, "grad_norm": 1.8626137658438633, "language_loss": 0.92026615, "learning_rate": 3.985719528211341e-06, "loss": 0.94322407, "num_input_tokens_seen": 11826655, "step": 557, "time_per_iteration": 2.5586485862731934 }, { "auxiliary_loss_clip": 0.01148461, "auxiliary_loss_mlp": 0.01007209, "balance_loss_clip": 1.04742885, "balance_loss_mlp": 1.00272715, "epoch": 0.06709553297661276, "flos": 62688216936960.0, "grad_norm": 0.8434689346951716, "language_loss": 0.63035369, "learning_rate": 3.985626456078777e-06, "loss": 0.65191042, "num_input_tokens_seen": 11891310, "step": 558, "time_per_iteration": 3.2917227745056152 }, { "auxiliary_loss_clip": 0.01223988, "auxiliary_loss_mlp": 0.01064637, "balance_loss_clip": 1.07326961, "balance_loss_mlp": 1.05341959, "epoch": 0.06721577586725185, "flos": 11216590997760.0, "grad_norm": 2.094885742819683, "language_loss": 0.86666155, "learning_rate": 3.985533082728259e-06, "loss": 0.88954777, "num_input_tokens_seen": 11906965, "step": 559, "time_per_iteration": 2.6066524982452393 }, { "auxiliary_loss_clip": 0.01265259, "auxiliary_loss_mlp": 0.01046213, "balance_loss_clip": 1.07319772, "balance_loss_mlp": 1.03360105, "epoch": 0.06733601875789094, "flos": 25922189664000.0, "grad_norm": 2.3755254895534, "language_loss": 0.74948728, "learning_rate": 3.985439408173951e-06, "loss": 0.77260208, "num_input_tokens_seen": 11927190, "step": 560, "time_per_iteration": 2.5739145278930664 }, { "auxiliary_loss_clip": 0.01270558, "auxiliary_loss_mlp": 0.01053039, "balance_loss_clip": 1.07823896, "balance_loss_mlp": 1.04112399, "epoch": 0.06745626164853002, "flos": 20813645577600.0, "grad_norm": 1.8919298430118658, "language_loss": 0.70740288, "learning_rate": 3.9853454324300634e-06, "loss": 0.73063886, "num_input_tokens_seen": 11946400, "step": 561, "time_per_iteration": 2.533182382583618 }, { "auxiliary_loss_clip": 0.01193327, "auxiliary_loss_mlp": 0.0103685, "balance_loss_clip": 1.06375563, "balance_loss_mlp": 1.02568042, "epoch": 0.06757650453916912, "flos": 19829262378240.0, "grad_norm": 2.006314212278964, "language_loss": 0.78276628, "learning_rate": 3.985251155510852e-06, "loss": 0.80506802, "num_input_tokens_seen": 11965430, "step": 562, "time_per_iteration": 2.691563606262207 }, { "auxiliary_loss_clip": 0.01204275, "auxiliary_loss_mlp": 0.01047113, "balance_loss_clip": 1.06972229, "balance_loss_mlp": 1.03574049, "epoch": 0.06769674742980822, "flos": 25739224761600.0, "grad_norm": 1.863915646526964, "language_loss": 0.80191898, "learning_rate": 3.98515657743062e-06, "loss": 0.82443291, "num_input_tokens_seen": 11984895, "step": 563, "time_per_iteration": 2.700944423675537 }, { "auxiliary_loss_clip": 0.012355, "auxiliary_loss_mlp": 0.01040289, "balance_loss_clip": 1.0714339, "balance_loss_mlp": 1.02878547, "epoch": 0.0678169903204473, "flos": 13074788355840.0, "grad_norm": 2.05161011808784, "language_loss": 0.77685714, "learning_rate": 3.985061698203711e-06, "loss": 0.79961509, "num_input_tokens_seen": 12002010, "step": 564, "time_per_iteration": 3.3761446475982666 }, { "auxiliary_loss_clip": 0.0116595, "auxiliary_loss_mlp": 0.01005224, "balance_loss_clip": 1.04803491, "balance_loss_mlp": 1.00083721, "epoch": 0.0679372332110864, "flos": 70865830788480.0, "grad_norm": 0.8835995455287483, "language_loss": 0.63859272, "learning_rate": 3.984966517844523e-06, "loss": 0.66030455, "num_input_tokens_seen": 12057255, "step": 565, "time_per_iteration": 3.094402551651001 }, { "auxiliary_loss_clip": 0.01268775, "auxiliary_loss_mlp": 0.01052581, "balance_loss_clip": 1.0763849, "balance_loss_mlp": 1.03927147, "epoch": 0.06805747610172548, "flos": 28256418990720.0, "grad_norm": 2.346608830005219, "language_loss": 0.80287027, "learning_rate": 3.984871036367492e-06, "loss": 0.8260839, "num_input_tokens_seen": 12077280, "step": 566, "time_per_iteration": 3.350151300430298 }, { "auxiliary_loss_clip": 0.01253555, "auxiliary_loss_mlp": 0.00764731, "balance_loss_clip": 1.07732797, "balance_loss_mlp": 1.0002383, "epoch": 0.06817771899236458, "flos": 20120533764480.0, "grad_norm": 1.8372645751282337, "language_loss": 0.8299458, "learning_rate": 3.984775253787102e-06, "loss": 0.85012859, "num_input_tokens_seen": 12095570, "step": 567, "time_per_iteration": 2.581946611404419 }, { "auxiliary_loss_clip": 0.01254616, "auxiliary_loss_mlp": 0.01049241, "balance_loss_clip": 1.07276189, "balance_loss_mlp": 1.03718305, "epoch": 0.06829796188300366, "flos": 17930629284480.0, "grad_norm": 2.761085261091875, "language_loss": 0.87530982, "learning_rate": 3.984679170117885e-06, "loss": 0.89834845, "num_input_tokens_seen": 12111775, "step": 568, "time_per_iteration": 3.3261606693267822 }, { "auxiliary_loss_clip": 0.01248153, "auxiliary_loss_mlp": 0.01042395, "balance_loss_clip": 1.07280684, "balance_loss_mlp": 1.03056371, "epoch": 0.06841820477364276, "flos": 14501627285760.0, "grad_norm": 2.382805183823801, "language_loss": 0.78667068, "learning_rate": 3.984582785374415e-06, "loss": 0.80957621, "num_input_tokens_seen": 12129215, "step": 569, "time_per_iteration": 2.527209520339966 }, { "auxiliary_loss_clip": 0.01239002, "auxiliary_loss_mlp": 0.00764958, "balance_loss_clip": 1.0745995, "balance_loss_mlp": 1.0001905, "epoch": 0.06853844766428185, "flos": 21938474954880.0, "grad_norm": 1.8555510296166642, "language_loss": 0.80676079, "learning_rate": 3.9844860995713155e-06, "loss": 0.82680035, "num_input_tokens_seen": 12148755, "step": 570, "time_per_iteration": 3.444244146347046 }, { "auxiliary_loss_clip": 0.01252332, "auxiliary_loss_mlp": 0.01042293, "balance_loss_clip": 1.07786918, "balance_loss_mlp": 1.03080094, "epoch": 0.06865869055492094, "flos": 16800628348800.0, "grad_norm": 3.6142838903423478, "language_loss": 0.8301686, "learning_rate": 3.9843891127232524e-06, "loss": 0.85311478, "num_input_tokens_seen": 12166290, "step": 571, "time_per_iteration": 2.507911205291748 }, { "auxiliary_loss_clip": 0.01195819, "auxiliary_loss_mlp": 0.01047849, "balance_loss_clip": 1.06633854, "balance_loss_mlp": 1.0361253, "epoch": 0.06877893344556003, "flos": 19937281553280.0, "grad_norm": 2.8282058937793004, "language_loss": 0.66688108, "learning_rate": 3.984291824844938e-06, "loss": 0.6893177, "num_input_tokens_seen": 12181385, "step": 572, "time_per_iteration": 2.650733709335327 }, { "auxiliary_loss_clip": 0.01266473, "auxiliary_loss_mlp": 0.01042681, "balance_loss_clip": 1.07532477, "balance_loss_mlp": 1.02953243, "epoch": 0.06889917633619912, "flos": 23039388852480.0, "grad_norm": 2.355002766446531, "language_loss": 0.85251236, "learning_rate": 3.984194235951132e-06, "loss": 0.8756038, "num_input_tokens_seen": 12197530, "step": 573, "time_per_iteration": 2.5389151573181152 }, { "auxiliary_loss_clip": 0.01271923, "auxiliary_loss_mlp": 0.01054624, "balance_loss_clip": 1.0799644, "balance_loss_mlp": 1.0432272, "epoch": 0.06901941922683821, "flos": 20960556203520.0, "grad_norm": 4.43937720080714, "language_loss": 0.8419416, "learning_rate": 3.9840963460566375e-06, "loss": 0.86520708, "num_input_tokens_seen": 12216310, "step": 574, "time_per_iteration": 2.6067237854003906 }, { "auxiliary_loss_clip": 0.01169661, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.06198335, "balance_loss_mlp": 1.02562571, "epoch": 0.06913966211747731, "flos": 24821850384000.0, "grad_norm": 3.311179596209405, "language_loss": 0.894072, "learning_rate": 3.983998155176305e-06, "loss": 0.91614133, "num_input_tokens_seen": 12236670, "step": 575, "time_per_iteration": 2.763029098510742 }, { "auxiliary_loss_clip": 0.01160508, "auxiliary_loss_mlp": 0.010049, "balance_loss_clip": 1.04397118, "balance_loss_mlp": 1.00072801, "epoch": 0.06925990500811639, "flos": 58367446957440.0, "grad_norm": 0.830623703357473, "language_loss": 0.57063115, "learning_rate": 3.9838996633250305e-06, "loss": 0.59228522, "num_input_tokens_seen": 12297185, "step": 576, "time_per_iteration": 3.122347116470337 }, { "auxiliary_loss_clip": 0.01249599, "auxiliary_loss_mlp": 0.01044346, "balance_loss_clip": 1.07066917, "balance_loss_mlp": 1.0330683, "epoch": 0.06938014789875549, "flos": 12749940731520.0, "grad_norm": 2.3058349790249943, "language_loss": 0.88155687, "learning_rate": 3.983800870517753e-06, "loss": 0.90449637, "num_input_tokens_seen": 12313975, "step": 577, "time_per_iteration": 2.5862770080566406 }, { "auxiliary_loss_clip": 0.01254505, "auxiliary_loss_mlp": 0.01044456, "balance_loss_clip": 1.08156228, "balance_loss_mlp": 1.03395391, "epoch": 0.06950039078939457, "flos": 22820226019200.0, "grad_norm": 3.824997725836329, "language_loss": 0.77813506, "learning_rate": 3.983701776769463e-06, "loss": 0.80112469, "num_input_tokens_seen": 12331385, "step": 578, "time_per_iteration": 2.5845932960510254 }, { "auxiliary_loss_clip": 0.01249057, "auxiliary_loss_mlp": 0.01044742, "balance_loss_clip": 1.07783639, "balance_loss_mlp": 1.03361964, "epoch": 0.06962063368003367, "flos": 21941348042880.0, "grad_norm": 1.8842813170030117, "language_loss": 0.85491693, "learning_rate": 3.9836023820951885e-06, "loss": 0.87785494, "num_input_tokens_seen": 12350600, "step": 579, "time_per_iteration": 2.6369168758392334 }, { "auxiliary_loss_clip": 0.01209793, "auxiliary_loss_mlp": 0.01039378, "balance_loss_clip": 1.06545925, "balance_loss_mlp": 1.02808857, "epoch": 0.06974087657067275, "flos": 20706021452160.0, "grad_norm": 2.209766851651428, "language_loss": 0.68487215, "learning_rate": 3.983502686510011e-06, "loss": 0.70736384, "num_input_tokens_seen": 12371430, "step": 580, "time_per_iteration": 2.6180615425109863 }, { "auxiliary_loss_clip": 0.01252036, "auxiliary_loss_mlp": 0.00764583, "balance_loss_clip": 1.07233715, "balance_loss_mlp": 1.00018048, "epoch": 0.06986111946131185, "flos": 22638230784000.0, "grad_norm": 2.0430110505292576, "language_loss": 0.73393524, "learning_rate": 3.9834026900290525e-06, "loss": 0.7541014, "num_input_tokens_seen": 12390825, "step": 581, "time_per_iteration": 2.574289321899414 }, { "auxiliary_loss_clip": 0.01271397, "auxiliary_loss_mlp": 0.01053371, "balance_loss_clip": 1.07903743, "balance_loss_mlp": 1.04242194, "epoch": 0.06998136235195095, "flos": 26943453152640.0, "grad_norm": 1.8425962360295622, "language_loss": 1.00423014, "learning_rate": 3.983302392667482e-06, "loss": 1.02747786, "num_input_tokens_seen": 12411670, "step": 582, "time_per_iteration": 2.5741097927093506 }, { "auxiliary_loss_clip": 0.01249497, "auxiliary_loss_mlp": 0.01039405, "balance_loss_clip": 1.07566059, "balance_loss_mlp": 1.0282824, "epoch": 0.07010160524259003, "flos": 22492505306880.0, "grad_norm": 1.697367303578273, "language_loss": 0.93736303, "learning_rate": 3.983201794440517e-06, "loss": 0.96025205, "num_input_tokens_seen": 12431245, "step": 583, "time_per_iteration": 2.582667827606201 }, { "auxiliary_loss_clip": 0.0122792, "auxiliary_loss_mlp": 0.01048745, "balance_loss_clip": 1.07451367, "balance_loss_mlp": 1.03746176, "epoch": 0.07022184813322913, "flos": 18332541538560.0, "grad_norm": 1.7939622328596636, "language_loss": 0.67671353, "learning_rate": 3.9831008953634165e-06, "loss": 0.69948018, "num_input_tokens_seen": 12450535, "step": 584, "time_per_iteration": 2.5641040802001953 }, { "auxiliary_loss_clip": 0.01190205, "auxiliary_loss_mlp": 0.01037188, "balance_loss_clip": 1.06516588, "balance_loss_mlp": 1.02477789, "epoch": 0.07034209102386821, "flos": 24675550289280.0, "grad_norm": 2.045445422766, "language_loss": 0.81114799, "learning_rate": 3.9829996954514864e-06, "loss": 0.83342195, "num_input_tokens_seen": 12469675, "step": 585, "time_per_iteration": 2.710773229598999 }, { "auxiliary_loss_clip": 0.01240791, "auxiliary_loss_mlp": 0.0104533, "balance_loss_clip": 1.07287335, "balance_loss_mlp": 1.03384423, "epoch": 0.0704623339145073, "flos": 25995878415360.0, "grad_norm": 1.8412923705667965, "language_loss": 0.84121746, "learning_rate": 3.982898194720079e-06, "loss": 0.8640787, "num_input_tokens_seen": 12490405, "step": 586, "time_per_iteration": 2.6194872856140137 }, { "auxiliary_loss_clip": 0.01231159, "auxiliary_loss_mlp": 0.00764576, "balance_loss_clip": 1.07606363, "balance_loss_mlp": 1.00019836, "epoch": 0.0705825768051464, "flos": 25338318088320.0, "grad_norm": 2.1615024574675985, "language_loss": 0.82324845, "learning_rate": 3.982796393184592e-06, "loss": 0.84320581, "num_input_tokens_seen": 12509485, "step": 587, "time_per_iteration": 2.6578071117401123 }, { "auxiliary_loss_clip": 0.01149022, "auxiliary_loss_mlp": 0.01007042, "balance_loss_clip": 1.04371333, "balance_loss_mlp": 1.00282216, "epoch": 0.07070281969578548, "flos": 66047552507520.0, "grad_norm": 0.7955068865099313, "language_loss": 0.62634337, "learning_rate": 3.98269429086047e-06, "loss": 0.64790404, "num_input_tokens_seen": 12567325, "step": 588, "time_per_iteration": 3.0219156742095947 }, { "auxiliary_loss_clip": 0.01215297, "auxiliary_loss_mlp": 0.01046663, "balance_loss_clip": 1.06680799, "balance_loss_mlp": 1.03340697, "epoch": 0.07082306258642458, "flos": 23653568528640.0, "grad_norm": 3.0462474769013586, "language_loss": 0.86191773, "learning_rate": 3.982591887763199e-06, "loss": 0.88453728, "num_input_tokens_seen": 12584785, "step": 589, "time_per_iteration": 2.5864641666412354 }, { "auxiliary_loss_clip": 0.01198788, "auxiliary_loss_mlp": 0.01044468, "balance_loss_clip": 1.06164217, "balance_loss_mlp": 1.03285122, "epoch": 0.07094330547706366, "flos": 13880049408000.0, "grad_norm": 2.1272604177637096, "language_loss": 0.81982666, "learning_rate": 3.982489183908316e-06, "loss": 0.84225923, "num_input_tokens_seen": 12601205, "step": 590, "time_per_iteration": 3.448108196258545 }, { "auxiliary_loss_clip": 0.01158801, "auxiliary_loss_mlp": 0.01042222, "balance_loss_clip": 1.05436742, "balance_loss_mlp": 1.03102195, "epoch": 0.07106354836770276, "flos": 24645098534400.0, "grad_norm": 1.8030931528634409, "language_loss": 0.8456409, "learning_rate": 3.982386179311399e-06, "loss": 0.86765116, "num_input_tokens_seen": 12621725, "step": 591, "time_per_iteration": 2.7079856395721436 }, { "auxiliary_loss_clip": 0.01255686, "auxiliary_loss_mlp": 0.01050817, "balance_loss_clip": 1.0767467, "balance_loss_mlp": 1.03776991, "epoch": 0.07118379125834184, "flos": 16217223649920.0, "grad_norm": 35.92314251916883, "language_loss": 0.87409043, "learning_rate": 3.982282873988075e-06, "loss": 0.8971554, "num_input_tokens_seen": 12639600, "step": 592, "time_per_iteration": 3.286536693572998 }, { "auxiliary_loss_clip": 0.01235046, "auxiliary_loss_mlp": 0.01047763, "balance_loss_clip": 1.07458925, "balance_loss_mlp": 1.03703368, "epoch": 0.07130403414898094, "flos": 19719986227200.0, "grad_norm": 1.643829576791502, "language_loss": 0.86781389, "learning_rate": 3.982179267954016e-06, "loss": 0.89064199, "num_input_tokens_seen": 12660030, "step": 593, "time_per_iteration": 2.6109697818756104 }, { "auxiliary_loss_clip": 0.01266146, "auxiliary_loss_mlp": 0.01039401, "balance_loss_clip": 1.07574534, "balance_loss_mlp": 1.02668691, "epoch": 0.07142427703962004, "flos": 21871933009920.0, "grad_norm": 2.299075894595495, "language_loss": 0.95922935, "learning_rate": 3.982075361224937e-06, "loss": 0.98228478, "num_input_tokens_seen": 12678395, "step": 594, "time_per_iteration": 3.2897706031799316 }, { "auxiliary_loss_clip": 0.01246527, "auxiliary_loss_mlp": 0.00764198, "balance_loss_clip": 1.07465398, "balance_loss_mlp": 1.0001483, "epoch": 0.07154451993025912, "flos": 18296595002880.0, "grad_norm": 1.7191436384403052, "language_loss": 0.87933683, "learning_rate": 3.981971153816602e-06, "loss": 0.8994441, "num_input_tokens_seen": 12696000, "step": 595, "time_per_iteration": 2.5463991165161133 }, { "auxiliary_loss_clip": 0.01269739, "auxiliary_loss_mlp": 0.01041305, "balance_loss_clip": 1.08211076, "balance_loss_mlp": 1.03065419, "epoch": 0.07166476282089822, "flos": 22160690444160.0, "grad_norm": 1.7731046260872771, "language_loss": 0.9622277, "learning_rate": 3.981866645744819e-06, "loss": 0.98533809, "num_input_tokens_seen": 12716715, "step": 596, "time_per_iteration": 3.378173351287842 }, { "auxiliary_loss_clip": 0.01270366, "auxiliary_loss_mlp": 0.00764827, "balance_loss_clip": 1.07944894, "balance_loss_mlp": 1.0001328, "epoch": 0.0717850057115373, "flos": 14136343925760.0, "grad_norm": 2.2172828328293384, "language_loss": 0.81395912, "learning_rate": 3.9817618370254416e-06, "loss": 0.83431107, "num_input_tokens_seen": 12733370, "step": 597, "time_per_iteration": 2.5280351638793945 }, { "auxiliary_loss_clip": 0.01267403, "auxiliary_loss_mlp": 0.01051578, "balance_loss_clip": 1.07789242, "balance_loss_mlp": 1.03990781, "epoch": 0.0719052486021764, "flos": 30917794412160.0, "grad_norm": 2.4443547115725464, "language_loss": 0.87597692, "learning_rate": 3.9816567276743684e-06, "loss": 0.8991667, "num_input_tokens_seen": 12753235, "step": 598, "time_per_iteration": 2.6008851528167725 }, { "auxiliary_loss_clip": 0.0123323, "auxiliary_loss_mlp": 0.01046572, "balance_loss_clip": 1.07458985, "balance_loss_mlp": 1.03496051, "epoch": 0.0720254914928155, "flos": 21287019939840.0, "grad_norm": 1.9011773027737742, "language_loss": 0.77859026, "learning_rate": 3.9815513177075466e-06, "loss": 0.80138826, "num_input_tokens_seen": 12772020, "step": 599, "time_per_iteration": 2.5959866046905518 }, { "auxiliary_loss_clip": 0.0124648, "auxiliary_loss_mlp": 0.01051306, "balance_loss_clip": 1.07554352, "balance_loss_mlp": 1.04066658, "epoch": 0.07214573438345458, "flos": 27819170732160.0, "grad_norm": 1.9484513951244535, "language_loss": 0.70091581, "learning_rate": 3.9814456071409646e-06, "loss": 0.72389364, "num_input_tokens_seen": 12792555, "step": 600, "time_per_iteration": 2.6005210876464844 }, { "auxiliary_loss_clip": 0.01205268, "auxiliary_loss_mlp": 0.01043478, "balance_loss_clip": 1.06790924, "balance_loss_mlp": 1.03014469, "epoch": 0.07226597727409367, "flos": 25483576688640.0, "grad_norm": 4.6983648096680986, "language_loss": 0.85177201, "learning_rate": 3.981339595990659e-06, "loss": 0.87425947, "num_input_tokens_seen": 12811085, "step": 601, "time_per_iteration": 2.6676185131073 }, { "auxiliary_loss_clip": 0.01248107, "auxiliary_loss_mlp": 0.01038942, "balance_loss_clip": 1.07472694, "balance_loss_mlp": 1.02660966, "epoch": 0.07238622016473276, "flos": 23513840622720.0, "grad_norm": 2.1407915145963456, "language_loss": 0.80976975, "learning_rate": 3.981233284272713e-06, "loss": 0.83264029, "num_input_tokens_seen": 12830830, "step": 602, "time_per_iteration": 2.5622141361236572 }, { "auxiliary_loss_clip": 0.01215528, "auxiliary_loss_mlp": 0.01060634, "balance_loss_clip": 1.06933403, "balance_loss_mlp": 1.04940403, "epoch": 0.07250646305537185, "flos": 25453519983360.0, "grad_norm": 1.5844699914677463, "language_loss": 0.90049022, "learning_rate": 3.981126672003253e-06, "loss": 0.92325181, "num_input_tokens_seen": 12853505, "step": 603, "time_per_iteration": 2.7046027183532715 }, { "auxiliary_loss_clip": 0.01233605, "auxiliary_loss_mlp": 0.01045759, "balance_loss_clip": 1.06696832, "balance_loss_mlp": 1.03337932, "epoch": 0.07262670594601094, "flos": 27155038216320.0, "grad_norm": 3.3874725745970116, "language_loss": 0.78695571, "learning_rate": 3.981019759198451e-06, "loss": 0.80974936, "num_input_tokens_seen": 12872455, "step": 604, "time_per_iteration": 2.620738983154297 }, { "auxiliary_loss_clip": 0.01238267, "auxiliary_loss_mlp": 0.01039353, "balance_loss_clip": 1.07208848, "balance_loss_mlp": 1.02843976, "epoch": 0.07274694883665003, "flos": 26651607148800.0, "grad_norm": 2.8468756392004404, "language_loss": 0.84112298, "learning_rate": 3.980912545874528e-06, "loss": 0.86389929, "num_input_tokens_seen": 12892620, "step": 605, "time_per_iteration": 2.610797166824341 }, { "auxiliary_loss_clip": 0.01240819, "auxiliary_loss_mlp": 0.00764837, "balance_loss_clip": 1.07046115, "balance_loss_mlp": 1.00014758, "epoch": 0.07286719172728913, "flos": 29862344154240.0, "grad_norm": 1.8721481056543563, "language_loss": 0.85317045, "learning_rate": 3.980805032047746e-06, "loss": 0.87322706, "num_input_tokens_seen": 12914090, "step": 606, "time_per_iteration": 2.6141562461853027 }, { "auxiliary_loss_clip": 0.01233356, "auxiliary_loss_mlp": 0.01041257, "balance_loss_clip": 1.07255697, "balance_loss_mlp": 1.02853107, "epoch": 0.07298743461792821, "flos": 17382057799680.0, "grad_norm": 1.977607385466361, "language_loss": 0.80752528, "learning_rate": 3.980697217734415e-06, "loss": 0.83027136, "num_input_tokens_seen": 12931830, "step": 607, "time_per_iteration": 2.549487590789795 }, { "auxiliary_loss_clip": 0.0121091, "auxiliary_loss_mlp": 0.00764146, "balance_loss_clip": 1.07085013, "balance_loss_mlp": 1.00014973, "epoch": 0.07310767750856731, "flos": 19498201701120.0, "grad_norm": 1.7908156720305866, "language_loss": 0.91592026, "learning_rate": 3.980589102950891e-06, "loss": 0.93567079, "num_input_tokens_seen": 12949995, "step": 608, "time_per_iteration": 2.640376567840576 }, { "auxiliary_loss_clip": 0.01232004, "auxiliary_loss_mlp": 0.01041755, "balance_loss_clip": 1.07379341, "balance_loss_mlp": 1.02962565, "epoch": 0.07322792039920639, "flos": 29168693637120.0, "grad_norm": 2.6317970254696452, "language_loss": 0.76063287, "learning_rate": 3.9804806877135755e-06, "loss": 0.78337044, "num_input_tokens_seen": 12968040, "step": 609, "time_per_iteration": 2.6500983238220215 }, { "auxiliary_loss_clip": 0.0125121, "auxiliary_loss_mlp": 0.00764646, "balance_loss_clip": 1.07265425, "balance_loss_mlp": 1.00012732, "epoch": 0.07334816328984549, "flos": 23477822259840.0, "grad_norm": 2.021657155323173, "language_loss": 0.86146146, "learning_rate": 3.980371972038915e-06, "loss": 0.88162005, "num_input_tokens_seen": 12988530, "step": 610, "time_per_iteration": 2.6194887161254883 }, { "auxiliary_loss_clip": 0.01270407, "auxiliary_loss_mlp": 0.01042944, "balance_loss_clip": 1.08243108, "balance_loss_mlp": 1.03104722, "epoch": 0.07346840618048459, "flos": 22962467877120.0, "grad_norm": 1.8635347826838835, "language_loss": 0.84682035, "learning_rate": 3.980262955943399e-06, "loss": 0.86995387, "num_input_tokens_seen": 13008195, "step": 611, "time_per_iteration": 2.5182180404663086 }, { "auxiliary_loss_clip": 0.01226572, "auxiliary_loss_mlp": 0.01040451, "balance_loss_clip": 1.073084, "balance_loss_mlp": 1.02895904, "epoch": 0.07358864907112367, "flos": 17673903803520.0, "grad_norm": 2.441455376178146, "language_loss": 0.86436659, "learning_rate": 3.980153639443569e-06, "loss": 0.8870368, "num_input_tokens_seen": 13024180, "step": 612, "time_per_iteration": 2.5855839252471924 }, { "auxiliary_loss_clip": 0.0124472, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.07335734, "balance_loss_mlp": 1.0258671, "epoch": 0.07370889196176277, "flos": 24097029840000.0, "grad_norm": 2.131494234672017, "language_loss": 0.79873705, "learning_rate": 3.980044022556005e-06, "loss": 0.82155466, "num_input_tokens_seen": 13043865, "step": 613, "time_per_iteration": 2.6133341789245605 }, { "auxiliary_loss_clip": 0.01250266, "auxiliary_loss_mlp": 0.01045482, "balance_loss_clip": 1.07451773, "balance_loss_mlp": 1.03346539, "epoch": 0.07382913485240185, "flos": 25885919905920.0, "grad_norm": 2.1177768348798733, "language_loss": 0.72913438, "learning_rate": 3.9799341052973375e-06, "loss": 0.75209183, "num_input_tokens_seen": 13063700, "step": 614, "time_per_iteration": 2.6194546222686768 }, { "auxiliary_loss_clip": 0.01237858, "auxiliary_loss_mlp": 0.0105344, "balance_loss_clip": 1.07642102, "balance_loss_mlp": 1.04119086, "epoch": 0.07394937774304094, "flos": 16873850223360.0, "grad_norm": 2.77562861425666, "language_loss": 0.75261366, "learning_rate": 3.979823887684241e-06, "loss": 0.77552664, "num_input_tokens_seen": 13082640, "step": 615, "time_per_iteration": 2.5674898624420166 }, { "auxiliary_loss_clip": 0.01269917, "auxiliary_loss_mlp": 0.01047473, "balance_loss_clip": 1.07961082, "balance_loss_mlp": 1.03577816, "epoch": 0.07406962063368003, "flos": 20703471586560.0, "grad_norm": 2.915259375920163, "language_loss": 0.84845644, "learning_rate": 3.979713369733434e-06, "loss": 0.87163031, "num_input_tokens_seen": 13100505, "step": 616, "time_per_iteration": 3.3737385272979736 }, { "auxiliary_loss_clip": 0.01245038, "auxiliary_loss_mlp": 0.01040357, "balance_loss_clip": 1.07423544, "balance_loss_mlp": 1.02841854, "epoch": 0.07418986352431912, "flos": 21430985650560.0, "grad_norm": 1.948726568914937, "language_loss": 0.84904587, "learning_rate": 3.979602551461683e-06, "loss": 0.87189984, "num_input_tokens_seen": 13121285, "step": 617, "time_per_iteration": 2.574411630630493 }, { "auxiliary_loss_clip": 0.01235467, "auxiliary_loss_mlp": 0.01043067, "balance_loss_clip": 1.07641602, "balance_loss_mlp": 1.03108668, "epoch": 0.07431010641495822, "flos": 12021133777920.0, "grad_norm": 2.406596779872574, "language_loss": 0.91457409, "learning_rate": 3.979491432885799e-06, "loss": 0.93735945, "num_input_tokens_seen": 13137550, "step": 618, "time_per_iteration": 3.3724093437194824 }, { "auxiliary_loss_clip": 0.01197828, "auxiliary_loss_mlp": 0.00763843, "balance_loss_clip": 1.06642413, "balance_loss_mlp": 1.00006378, "epoch": 0.0744303493055973, "flos": 20957575374720.0, "grad_norm": 1.943455562548442, "language_loss": 0.82691789, "learning_rate": 3.97938001402264e-06, "loss": 0.84653455, "num_input_tokens_seen": 13156675, "step": 619, "time_per_iteration": 2.6533150672912598 }, { "auxiliary_loss_clip": 0.01212155, "auxiliary_loss_mlp": 0.0104591, "balance_loss_clip": 1.06935608, "balance_loss_mlp": 1.03367352, "epoch": 0.0745505921962364, "flos": 16253134272000.0, "grad_norm": 2.6310430443024995, "language_loss": 0.79943204, "learning_rate": 3.979268294889105e-06, "loss": 0.82201272, "num_input_tokens_seen": 13172225, "step": 620, "time_per_iteration": 3.3003945350646973 }, { "auxiliary_loss_clip": 0.0126786, "auxiliary_loss_mlp": 0.01044713, "balance_loss_clip": 1.07977343, "balance_loss_mlp": 1.03311968, "epoch": 0.07467083508687548, "flos": 50944635550080.0, "grad_norm": 1.6832701069790912, "language_loss": 0.7394312, "learning_rate": 3.979156275502143e-06, "loss": 0.76255691, "num_input_tokens_seen": 13195885, "step": 621, "time_per_iteration": 2.7930386066436768 }, { "auxiliary_loss_clip": 0.01217313, "auxiliary_loss_mlp": 0.01045321, "balance_loss_clip": 1.06979895, "balance_loss_mlp": 1.03422308, "epoch": 0.07479107797751458, "flos": 17529686697600.0, "grad_norm": 2.192886830731349, "language_loss": 0.91460049, "learning_rate": 3.979043955878749e-06, "loss": 0.93722683, "num_input_tokens_seen": 13213730, "step": 622, "time_per_iteration": 3.4086458683013916 }, { "auxiliary_loss_clip": 0.0123177, "auxiliary_loss_mlp": 0.01043838, "balance_loss_clip": 1.07413077, "balance_loss_mlp": 1.03272164, "epoch": 0.07491132086815366, "flos": 23473943591040.0, "grad_norm": 2.0622493458565496, "language_loss": 0.83127832, "learning_rate": 3.978931336035959e-06, "loss": 0.85403436, "num_input_tokens_seen": 13232540, "step": 623, "time_per_iteration": 2.61781644821167 }, { "auxiliary_loss_clip": 0.01250511, "auxiliary_loss_mlp": 0.01052628, "balance_loss_clip": 1.07894623, "balance_loss_mlp": 1.04073715, "epoch": 0.07503156375879276, "flos": 20157557708160.0, "grad_norm": 2.438260168862922, "language_loss": 0.8242076, "learning_rate": 3.9788184159908595e-06, "loss": 0.84723902, "num_input_tokens_seen": 13249670, "step": 624, "time_per_iteration": 2.5374293327331543 }, { "auxiliary_loss_clip": 0.01228742, "auxiliary_loss_mlp": 0.01047767, "balance_loss_clip": 1.07099473, "balance_loss_mlp": 1.03744316, "epoch": 0.07515180664943186, "flos": 15115519653120.0, "grad_norm": 9.513964664246583, "language_loss": 0.82909536, "learning_rate": 3.97870519576058e-06, "loss": 0.8518604, "num_input_tokens_seen": 13266095, "step": 625, "time_per_iteration": 2.588409900665283 }, { "auxiliary_loss_clip": 0.0121461, "auxiliary_loss_mlp": 0.00764423, "balance_loss_clip": 1.06947434, "balance_loss_mlp": 1.00010991, "epoch": 0.07527204954007094, "flos": 21287702298240.0, "grad_norm": 2.4330532550688275, "language_loss": 0.81452489, "learning_rate": 3.978591675362295e-06, "loss": 0.83431524, "num_input_tokens_seen": 13284810, "step": 626, "time_per_iteration": 2.642906665802002 }, { "auxiliary_loss_clip": 0.01199338, "auxiliary_loss_mlp": 0.01044088, "balance_loss_clip": 1.07173538, "balance_loss_mlp": 1.03231573, "epoch": 0.07539229243071004, "flos": 21324187537920.0, "grad_norm": 1.8016025446176207, "language_loss": 0.87751627, "learning_rate": 3.978477854813226e-06, "loss": 0.8999505, "num_input_tokens_seen": 13304150, "step": 627, "time_per_iteration": 2.636841297149658 }, { "auxiliary_loss_clip": 0.01247553, "auxiliary_loss_mlp": 0.01044285, "balance_loss_clip": 1.07349169, "balance_loss_mlp": 1.03363967, "epoch": 0.07551253532134912, "flos": 13042540920960.0, "grad_norm": 1.9851511149859102, "language_loss": 0.82149374, "learning_rate": 3.97836373413064e-06, "loss": 0.84441221, "num_input_tokens_seen": 13322205, "step": 628, "time_per_iteration": 2.582611560821533 }, { "auxiliary_loss_clip": 0.01265033, "auxiliary_loss_mlp": 0.01048728, "balance_loss_clip": 1.07529688, "balance_loss_mlp": 1.0369916, "epoch": 0.07563277821198822, "flos": 19208761908480.0, "grad_norm": 1.948528439334513, "language_loss": 0.7498638, "learning_rate": 3.978249313331848e-06, "loss": 0.77300143, "num_input_tokens_seen": 13340435, "step": 629, "time_per_iteration": 2.5087220668792725 }, { "auxiliary_loss_clip": 0.01253324, "auxiliary_loss_mlp": 0.00764256, "balance_loss_clip": 1.07240677, "balance_loss_mlp": 1.0000813, "epoch": 0.07575302110262731, "flos": 19537200892800.0, "grad_norm": 5.788076286824976, "language_loss": 0.62327188, "learning_rate": 3.978134592434208e-06, "loss": 0.6434477, "num_input_tokens_seen": 13358185, "step": 630, "time_per_iteration": 2.549100875854492 }, { "auxiliary_loss_clip": 0.01098796, "auxiliary_loss_mlp": 0.01009167, "balance_loss_clip": 1.04131293, "balance_loss_mlp": 1.00509024, "epoch": 0.0758732639932664, "flos": 67961808017280.0, "grad_norm": 1.0659664905712698, "language_loss": 0.59384716, "learning_rate": 3.978019571455123e-06, "loss": 0.61492682, "num_input_tokens_seen": 13410130, "step": 631, "time_per_iteration": 3.2172605991363525 }, { "auxiliary_loss_clip": 0.01263272, "auxiliary_loss_mlp": 0.01041318, "balance_loss_clip": 1.07766306, "balance_loss_mlp": 1.03041601, "epoch": 0.07599350688390549, "flos": 18989204025600.0, "grad_norm": 2.534020719606259, "language_loss": 0.83666533, "learning_rate": 3.977904250412042e-06, "loss": 0.85971129, "num_input_tokens_seen": 13429085, "step": 632, "time_per_iteration": 2.521507978439331 }, { "auxiliary_loss_clip": 0.0123606, "auxiliary_loss_mlp": 0.01047614, "balance_loss_clip": 1.07108498, "balance_loss_mlp": 1.03695107, "epoch": 0.07611374977454458, "flos": 21069006341760.0, "grad_norm": 2.0673782708218473, "language_loss": 0.85584974, "learning_rate": 3.97778862932246e-06, "loss": 0.87868649, "num_input_tokens_seen": 13446250, "step": 633, "time_per_iteration": 2.5648880004882812 }, { "auxiliary_loss_clip": 0.01129217, "auxiliary_loss_mlp": 0.01042748, "balance_loss_clip": 1.0504632, "balance_loss_mlp": 1.03126812, "epoch": 0.07623399266518367, "flos": 18514536773760.0, "grad_norm": 2.1042966245299923, "language_loss": 0.94337523, "learning_rate": 3.9776727082039144e-06, "loss": 0.96509486, "num_input_tokens_seen": 13463220, "step": 634, "time_per_iteration": 3.1057932376861572 }, { "auxiliary_loss_clip": 0.01157881, "auxiliary_loss_mlp": 0.01003554, "balance_loss_clip": 1.0426898, "balance_loss_mlp": 0.99954891, "epoch": 0.07635423555582276, "flos": 44663036077440.0, "grad_norm": 0.8047699436480924, "language_loss": 0.55511081, "learning_rate": 3.977556487073991e-06, "loss": 0.57672513, "num_input_tokens_seen": 13517775, "step": 635, "time_per_iteration": 3.328406572341919 }, { "auxiliary_loss_clip": 0.01220475, "auxiliary_loss_mlp": 0.01043024, "balance_loss_clip": 1.06516349, "balance_loss_mlp": 1.03237891, "epoch": 0.07647447844646185, "flos": 21761148487680.0, "grad_norm": 1.6419205092186242, "language_loss": 0.81547046, "learning_rate": 3.97743996595032e-06, "loss": 0.83810544, "num_input_tokens_seen": 13537815, "step": 636, "time_per_iteration": 2.6144723892211914 }, { "auxiliary_loss_clip": 0.01265052, "auxiliary_loss_mlp": 0.0104693, "balance_loss_clip": 1.07586026, "balance_loss_mlp": 1.03475857, "epoch": 0.07659472133710095, "flos": 23806799948160.0, "grad_norm": 1.5147758449836068, "language_loss": 0.81736451, "learning_rate": 3.9773231448505804e-06, "loss": 0.84048432, "num_input_tokens_seen": 13559605, "step": 637, "time_per_iteration": 2.5609283447265625 }, { "auxiliary_loss_clip": 0.01230244, "auxiliary_loss_mlp": 0.00764692, "balance_loss_clip": 1.07415557, "balance_loss_mlp": 1.00004554, "epoch": 0.07671496422774003, "flos": 21469984842240.0, "grad_norm": 10.763873394752201, "language_loss": 0.78216302, "learning_rate": 3.977206023792491e-06, "loss": 0.80211234, "num_input_tokens_seen": 13579495, "step": 638, "time_per_iteration": 2.566572427749634 }, { "auxiliary_loss_clip": 0.01248069, "auxiliary_loss_mlp": 0.01046168, "balance_loss_clip": 1.0761708, "balance_loss_mlp": 1.0354569, "epoch": 0.07683520711837913, "flos": 16980971558400.0, "grad_norm": 2.0329079762414906, "language_loss": 0.81166631, "learning_rate": 3.97708860279382e-06, "loss": 0.83460867, "num_input_tokens_seen": 13597605, "step": 639, "time_per_iteration": 2.5181567668914795 }, { "auxiliary_loss_clip": 0.01210637, "auxiliary_loss_mlp": 0.01047851, "balance_loss_clip": 1.06811142, "balance_loss_mlp": 1.03683031, "epoch": 0.07695545000901821, "flos": 23476744851840.0, "grad_norm": 2.607175778071035, "language_loss": 0.78069383, "learning_rate": 3.97697088187238e-06, "loss": 0.80327868, "num_input_tokens_seen": 13618120, "step": 640, "time_per_iteration": 2.6024010181427 }, { "auxiliary_loss_clip": 0.01227081, "auxiliary_loss_mlp": 0.0104762, "balance_loss_clip": 1.07290816, "balance_loss_mlp": 1.03699255, "epoch": 0.07707569289965731, "flos": 17634258167040.0, "grad_norm": 2.119438085117736, "language_loss": 0.91744316, "learning_rate": 3.976852861046029e-06, "loss": 0.9401902, "num_input_tokens_seen": 13634735, "step": 641, "time_per_iteration": 2.5320682525634766 }, { "auxiliary_loss_clip": 0.01195335, "auxiliary_loss_mlp": 0.01044347, "balance_loss_clip": 1.06777835, "balance_loss_mlp": 1.03368938, "epoch": 0.0771959357902964, "flos": 25775674087680.0, "grad_norm": 1.7410008581829073, "language_loss": 0.80302382, "learning_rate": 3.97673454033267e-06, "loss": 0.82542062, "num_input_tokens_seen": 13656835, "step": 642, "time_per_iteration": 3.5156381130218506 }, { "auxiliary_loss_clip": 0.01232599, "auxiliary_loss_mlp": 0.01035399, "balance_loss_clip": 1.06953156, "balance_loss_mlp": 1.02456307, "epoch": 0.07731617868093549, "flos": 19828651847040.0, "grad_norm": 1.9749917261751988, "language_loss": 0.82543135, "learning_rate": 3.976615919750254e-06, "loss": 0.84811139, "num_input_tokens_seen": 13674535, "step": 643, "time_per_iteration": 2.5624449253082275 }, { "auxiliary_loss_clip": 0.01245732, "auxiliary_loss_mlp": 0.01043627, "balance_loss_clip": 1.07404852, "balance_loss_mlp": 1.03307092, "epoch": 0.07743642157157458, "flos": 21324654414720.0, "grad_norm": 2.0249986741633776, "language_loss": 0.86907494, "learning_rate": 3.976496999316775e-06, "loss": 0.89196849, "num_input_tokens_seen": 13693290, "step": 644, "time_per_iteration": 3.3537442684173584 }, { "auxiliary_loss_clip": 0.01228062, "auxiliary_loss_mlp": 0.01038452, "balance_loss_clip": 1.07467937, "balance_loss_mlp": 1.02707911, "epoch": 0.07755666446221367, "flos": 19969133938560.0, "grad_norm": 1.9735132389141057, "language_loss": 0.84280801, "learning_rate": 3.976377779050271e-06, "loss": 0.86547315, "num_input_tokens_seen": 13711420, "step": 645, "time_per_iteration": 2.575831413269043 }, { "auxiliary_loss_clip": 0.01234936, "auxiliary_loss_mlp": 0.01059784, "balance_loss_clip": 1.07109261, "balance_loss_mlp": 1.04786301, "epoch": 0.07767690735285276, "flos": 23623224514560.0, "grad_norm": 2.118699492365852, "language_loss": 0.84234107, "learning_rate": 3.976258258968831e-06, "loss": 0.86528826, "num_input_tokens_seen": 13729965, "step": 646, "time_per_iteration": 3.278416633605957 }, { "auxiliary_loss_clip": 0.01217054, "auxiliary_loss_mlp": 0.01051357, "balance_loss_clip": 1.07268691, "balance_loss_mlp": 1.04037189, "epoch": 0.07779715024349185, "flos": 22236246702720.0, "grad_norm": 2.0777545475289174, "language_loss": 0.74172831, "learning_rate": 3.976138439090583e-06, "loss": 0.76441234, "num_input_tokens_seen": 13748045, "step": 647, "time_per_iteration": 3.4510860443115234 }, { "auxiliary_loss_clip": 0.01219911, "auxiliary_loss_mlp": 0.01037999, "balance_loss_clip": 1.07361889, "balance_loss_mlp": 1.02625692, "epoch": 0.07791739313413094, "flos": 20955097336320.0, "grad_norm": 2.3559524373423875, "language_loss": 0.85221779, "learning_rate": 3.976018319433706e-06, "loss": 0.87479687, "num_input_tokens_seen": 13765590, "step": 648, "time_per_iteration": 2.6218042373657227 }, { "auxiliary_loss_clip": 0.0124986, "auxiliary_loss_mlp": 0.0104822, "balance_loss_clip": 1.07748771, "balance_loss_mlp": 1.03698444, "epoch": 0.07803763602477004, "flos": 19312327797120.0, "grad_norm": 2.5852670402787585, "language_loss": 0.91375577, "learning_rate": 3.9758979000164205e-06, "loss": 0.93673652, "num_input_tokens_seen": 13782410, "step": 649, "time_per_iteration": 2.5280842781066895 }, { "auxiliary_loss_clip": 0.01216573, "auxiliary_loss_mlp": 0.01047688, "balance_loss_clip": 1.06968379, "balance_loss_mlp": 1.03570163, "epoch": 0.07815787891540912, "flos": 22710806213760.0, "grad_norm": 2.250179591520519, "language_loss": 0.72282112, "learning_rate": 3.975777180856995e-06, "loss": 0.74546373, "num_input_tokens_seen": 13801530, "step": 650, "time_per_iteration": 2.742828130722046 }, { "auxiliary_loss_clip": 0.012623, "auxiliary_loss_mlp": 0.01052915, "balance_loss_clip": 1.07408202, "balance_loss_mlp": 1.04030252, "epoch": 0.07827812180604822, "flos": 22711129436160.0, "grad_norm": 2.183035153385274, "language_loss": 0.86146283, "learning_rate": 3.975656161973742e-06, "loss": 0.88461494, "num_input_tokens_seen": 13820615, "step": 651, "time_per_iteration": 2.5272269248962402 }, { "auxiliary_loss_clip": 0.01264425, "auxiliary_loss_mlp": 0.0103884, "balance_loss_clip": 1.07566297, "balance_loss_mlp": 1.02681732, "epoch": 0.0783983646966873, "flos": 21725597001600.0, "grad_norm": 2.484751720943226, "language_loss": 0.88969177, "learning_rate": 3.9755348433850194e-06, "loss": 0.91272449, "num_input_tokens_seen": 13835955, "step": 652, "time_per_iteration": 2.51649808883667 }, { "auxiliary_loss_clip": 0.01118763, "auxiliary_loss_mlp": 0.01028048, "balance_loss_clip": 1.03256679, "balance_loss_mlp": 1.02432883, "epoch": 0.0785186075873264, "flos": 60640877537280.0, "grad_norm": 0.9816403410167297, "language_loss": 0.63708246, "learning_rate": 3.975413225109232e-06, "loss": 0.65855062, "num_input_tokens_seen": 13896505, "step": 653, "time_per_iteration": 3.2101926803588867 }, { "auxiliary_loss_clip": 0.01247308, "auxiliary_loss_mlp": 0.01046358, "balance_loss_clip": 1.07406116, "balance_loss_mlp": 1.03492594, "epoch": 0.0786388504779655, "flos": 23877902920320.0, "grad_norm": 5.396901363328418, "language_loss": 0.93887901, "learning_rate": 3.975291307164829e-06, "loss": 0.96181571, "num_input_tokens_seen": 13915150, "step": 654, "time_per_iteration": 2.549163818359375 }, { "auxiliary_loss_clip": 0.01209529, "auxiliary_loss_mlp": 0.01041569, "balance_loss_clip": 1.06732082, "balance_loss_mlp": 1.03109622, "epoch": 0.07875909336860458, "flos": 15158684822400.0, "grad_norm": 2.5429683545436044, "language_loss": 0.84950566, "learning_rate": 3.975169089570306e-06, "loss": 0.87201667, "num_input_tokens_seen": 13933525, "step": 655, "time_per_iteration": 2.5864341259002686 }, { "auxiliary_loss_clip": 0.01229129, "auxiliary_loss_mlp": 0.01054227, "balance_loss_clip": 1.06942475, "balance_loss_mlp": 1.0433197, "epoch": 0.07887933625924368, "flos": 22236857233920.0, "grad_norm": 1.9945082297658308, "language_loss": 0.91684657, "learning_rate": 3.975046572344202e-06, "loss": 0.9396801, "num_input_tokens_seen": 13949985, "step": 656, "time_per_iteration": 2.5253217220306396 }, { "auxiliary_loss_clip": 0.0120916, "auxiliary_loss_mlp": 0.01043478, "balance_loss_clip": 1.06338, "balance_loss_mlp": 1.03276706, "epoch": 0.07899957914988276, "flos": 20777734955520.0, "grad_norm": 1.840019596077486, "language_loss": 0.71130049, "learning_rate": 3.974923755505103e-06, "loss": 0.73382694, "num_input_tokens_seen": 13969215, "step": 657, "time_per_iteration": 2.610306978225708 }, { "auxiliary_loss_clip": 0.01202773, "auxiliary_loss_mlp": 0.01045558, "balance_loss_clip": 1.06643629, "balance_loss_mlp": 1.03466225, "epoch": 0.07911982204052186, "flos": 23003047267200.0, "grad_norm": 1.6410328173398758, "language_loss": 0.9120667, "learning_rate": 3.974800639071641e-06, "loss": 0.93455005, "num_input_tokens_seen": 13989935, "step": 658, "time_per_iteration": 2.6287972927093506 }, { "auxiliary_loss_clip": 0.01166353, "auxiliary_loss_mlp": 0.00764418, "balance_loss_clip": 1.06033051, "balance_loss_mlp": 1.0000422, "epoch": 0.07924006493116094, "flos": 23111389664640.0, "grad_norm": 2.3022066213681387, "language_loss": 1.00449967, "learning_rate": 3.974677223062492e-06, "loss": 1.02380741, "num_input_tokens_seen": 14007150, "step": 659, "time_per_iteration": 2.663703203201294 }, { "auxiliary_loss_clip": 0.01229152, "auxiliary_loss_mlp": 0.01047884, "balance_loss_clip": 1.07331169, "balance_loss_mlp": 1.03629112, "epoch": 0.07936030782180004, "flos": 16472153450880.0, "grad_norm": 2.2380522093004376, "language_loss": 0.74331439, "learning_rate": 3.974553507496378e-06, "loss": 0.76608485, "num_input_tokens_seen": 14025725, "step": 660, "time_per_iteration": 2.540658712387085 }, { "auxiliary_loss_clip": 0.01217108, "auxiliary_loss_mlp": 0.01051161, "balance_loss_clip": 1.07054913, "balance_loss_mlp": 1.03918672, "epoch": 0.07948055071243913, "flos": 23733290764800.0, "grad_norm": 2.6919412560332487, "language_loss": 0.89475942, "learning_rate": 3.974429492392068e-06, "loss": 0.91744208, "num_input_tokens_seen": 14045750, "step": 661, "time_per_iteration": 2.596468448638916 }, { "auxiliary_loss_clip": 0.01262585, "auxiliary_loss_mlp": 0.00764029, "balance_loss_clip": 1.07624722, "balance_loss_mlp": 1.00003386, "epoch": 0.07960079360307822, "flos": 19573326996480.0, "grad_norm": 1.8403358785575636, "language_loss": 0.90973181, "learning_rate": 3.974305177768373e-06, "loss": 0.92999792, "num_input_tokens_seen": 14063960, "step": 662, "time_per_iteration": 2.4790172576904297 }, { "auxiliary_loss_clip": 0.01205186, "auxiliary_loss_mlp": 0.01052145, "balance_loss_clip": 1.07001829, "balance_loss_mlp": 1.04060543, "epoch": 0.07972103649371731, "flos": 23513409659520.0, "grad_norm": 2.304305483605101, "language_loss": 0.86608994, "learning_rate": 3.974180563644152e-06, "loss": 0.88866329, "num_input_tokens_seen": 14082525, "step": 663, "time_per_iteration": 2.611039161682129 }, { "auxiliary_loss_clip": 0.01232641, "auxiliary_loss_mlp": 0.01059702, "balance_loss_clip": 1.07253551, "balance_loss_mlp": 1.04877639, "epoch": 0.0798412793843564, "flos": 16726867770240.0, "grad_norm": 2.652742128899269, "language_loss": 0.89274442, "learning_rate": 3.97405565003831e-06, "loss": 0.91566783, "num_input_tokens_seen": 14098610, "step": 664, "time_per_iteration": 2.5410053730010986 }, { "auxiliary_loss_clip": 0.01217088, "auxiliary_loss_mlp": 0.01048503, "balance_loss_clip": 1.06942248, "balance_loss_mlp": 1.03648663, "epoch": 0.07996152227499549, "flos": 18223337214720.0, "grad_norm": 3.11565892683201, "language_loss": 0.78300601, "learning_rate": 3.973930436969794e-06, "loss": 0.80566192, "num_input_tokens_seen": 14117065, "step": 665, "time_per_iteration": 2.5721709728240967 }, { "auxiliary_loss_clip": 0.01215038, "auxiliary_loss_mlp": 0.01045687, "balance_loss_clip": 1.06612158, "balance_loss_mlp": 1.03390288, "epoch": 0.08008176516563459, "flos": 20594877793920.0, "grad_norm": 1.7785638732380034, "language_loss": 0.85457528, "learning_rate": 3.973804924457602e-06, "loss": 0.87718248, "num_input_tokens_seen": 14135145, "step": 666, "time_per_iteration": 2.5511574745178223 }, { "auxiliary_loss_clip": 0.01216327, "auxiliary_loss_mlp": 0.01055205, "balance_loss_clip": 1.06651843, "balance_loss_mlp": 1.04362345, "epoch": 0.08020200805627367, "flos": 31834306863360.0, "grad_norm": 1.8682061671208274, "language_loss": 0.8556028, "learning_rate": 3.973679112520771e-06, "loss": 0.87831813, "num_input_tokens_seen": 14156860, "step": 667, "time_per_iteration": 2.659978151321411 }, { "auxiliary_loss_clip": 0.01204722, "auxiliary_loss_mlp": 0.0104045, "balance_loss_clip": 1.06583059, "balance_loss_mlp": 1.02963793, "epoch": 0.08032225094691277, "flos": 17783503176960.0, "grad_norm": 2.097500647352574, "language_loss": 0.98953331, "learning_rate": 3.973553001178389e-06, "loss": 1.01198506, "num_input_tokens_seen": 14174365, "step": 668, "time_per_iteration": 3.3798298835754395 }, { "auxiliary_loss_clip": 0.01213259, "auxiliary_loss_mlp": 0.01043982, "balance_loss_clip": 1.07165504, "balance_loss_mlp": 1.03238869, "epoch": 0.08044249383755185, "flos": 24061693835520.0, "grad_norm": 2.006771034728547, "language_loss": 0.7574355, "learning_rate": 3.973426590449585e-06, "loss": 0.78000796, "num_input_tokens_seen": 14192320, "step": 669, "time_per_iteration": 2.624657154083252 }, { "auxiliary_loss_clip": 0.01195969, "auxiliary_loss_mlp": 0.01060293, "balance_loss_clip": 1.06792402, "balance_loss_mlp": 1.04887271, "epoch": 0.08056273672819095, "flos": 18223624523520.0, "grad_norm": 1.927283123588554, "language_loss": 0.75457543, "learning_rate": 3.9732998803535364e-06, "loss": 0.77713805, "num_input_tokens_seen": 14210380, "step": 670, "time_per_iteration": 2.6177937984466553 }, { "auxiliary_loss_clip": 0.01261682, "auxiliary_loss_mlp": 0.0104671, "balance_loss_clip": 1.07504272, "balance_loss_mlp": 1.03591013, "epoch": 0.08068297961883003, "flos": 19676856971520.0, "grad_norm": 2.2056199452305605, "language_loss": 0.85208964, "learning_rate": 3.973172870909465e-06, "loss": 0.87517357, "num_input_tokens_seen": 14225145, "step": 671, "time_per_iteration": 3.267508029937744 }, { "auxiliary_loss_clip": 0.01233209, "auxiliary_loss_mlp": 0.01042065, "balance_loss_clip": 1.06971145, "balance_loss_mlp": 1.03045964, "epoch": 0.08080322250946913, "flos": 23148736830720.0, "grad_norm": 2.8009083776990638, "language_loss": 0.80655849, "learning_rate": 3.973045562136638e-06, "loss": 0.82931119, "num_input_tokens_seen": 14241960, "step": 672, "time_per_iteration": 3.340522050857544 }, { "auxiliary_loss_clip": 0.01248115, "auxiliary_loss_mlp": 0.01042388, "balance_loss_clip": 1.07226694, "balance_loss_mlp": 1.03157592, "epoch": 0.08092346540010822, "flos": 21763626526080.0, "grad_norm": 2.85389859891496, "language_loss": 0.91609263, "learning_rate": 3.972917954054368e-06, "loss": 0.93899763, "num_input_tokens_seen": 14260515, "step": 673, "time_per_iteration": 3.3753490447998047 }, { "auxiliary_loss_clip": 0.01225998, "auxiliary_loss_mlp": 0.01058112, "balance_loss_clip": 1.07287645, "balance_loss_mlp": 1.0451597, "epoch": 0.08104370829074731, "flos": 21032485188480.0, "grad_norm": 2.6117459340132902, "language_loss": 0.81511486, "learning_rate": 3.972790046682013e-06, "loss": 0.83795589, "num_input_tokens_seen": 14279190, "step": 674, "time_per_iteration": 2.5614469051361084 }, { "auxiliary_loss_clip": 0.01216137, "auxiliary_loss_mlp": 0.01044367, "balance_loss_clip": 1.06608772, "balance_loss_mlp": 1.0328747, "epoch": 0.0811639511813864, "flos": 20083186598400.0, "grad_norm": 2.0525096280221815, "language_loss": 0.7904824, "learning_rate": 3.972661840038977e-06, "loss": 0.8130874, "num_input_tokens_seen": 14299480, "step": 675, "time_per_iteration": 2.6291770935058594 }, { "auxiliary_loss_clip": 0.01244688, "auxiliary_loss_mlp": 0.01053148, "balance_loss_clip": 1.07299435, "balance_loss_mlp": 1.04100657, "epoch": 0.08128419407202549, "flos": 16836718538880.0, "grad_norm": 2.417626437162247, "language_loss": 0.83719176, "learning_rate": 3.972533334144707e-06, "loss": 0.86017013, "num_input_tokens_seen": 14316405, "step": 676, "time_per_iteration": 2.5016086101531982 }, { "auxiliary_loss_clip": 0.01248695, "auxiliary_loss_mlp": 0.01042206, "balance_loss_clip": 1.07147455, "balance_loss_mlp": 1.02961719, "epoch": 0.08140443696266458, "flos": 23769273214080.0, "grad_norm": 2.0113015524865574, "language_loss": 0.78591239, "learning_rate": 3.972404529018699e-06, "loss": 0.80882138, "num_input_tokens_seen": 14336265, "step": 677, "time_per_iteration": 2.5523087978363037 }, { "auxiliary_loss_clip": 0.01220624, "auxiliary_loss_mlp": 0.01046614, "balance_loss_clip": 1.06243527, "balance_loss_mlp": 1.03534842, "epoch": 0.08152467985330367, "flos": 24390132819840.0, "grad_norm": 1.73388659445206, "language_loss": 0.85619766, "learning_rate": 3.972275424680493e-06, "loss": 0.87887001, "num_input_tokens_seen": 14356375, "step": 678, "time_per_iteration": 2.5865321159362793 }, { "auxiliary_loss_clip": 0.01260845, "auxiliary_loss_mlp": 0.01045511, "balance_loss_clip": 1.07374597, "balance_loss_mlp": 1.03440642, "epoch": 0.08164492274394276, "flos": 19317750750720.0, "grad_norm": 2.009376756079574, "language_loss": 0.91422939, "learning_rate": 3.972146021149673e-06, "loss": 0.93729293, "num_input_tokens_seen": 14374650, "step": 679, "time_per_iteration": 2.4957919120788574 }, { "auxiliary_loss_clip": 0.01211441, "auxiliary_loss_mlp": 0.01046124, "balance_loss_clip": 1.06708813, "balance_loss_mlp": 1.03540659, "epoch": 0.08176516563458186, "flos": 14830461319680.0, "grad_norm": 2.566822069432394, "language_loss": 0.78518242, "learning_rate": 3.972016318445868e-06, "loss": 0.80775803, "num_input_tokens_seen": 14392650, "step": 680, "time_per_iteration": 2.5734832286834717 }, { "auxiliary_loss_clip": 0.01243731, "auxiliary_loss_mlp": 0.01044237, "balance_loss_clip": 1.0721519, "balance_loss_mlp": 1.03287625, "epoch": 0.08188540852522094, "flos": 22602320161920.0, "grad_norm": 2.02559103890426, "language_loss": 0.91878617, "learning_rate": 3.971886316588757e-06, "loss": 0.94166583, "num_input_tokens_seen": 14413155, "step": 681, "time_per_iteration": 2.537391185760498 }, { "auxiliary_loss_clip": 0.01199138, "auxiliary_loss_mlp": 0.01040944, "balance_loss_clip": 1.06776381, "balance_loss_mlp": 1.02889204, "epoch": 0.08200565141586004, "flos": 19463727623040.0, "grad_norm": 2.8378798534785616, "language_loss": 0.73280638, "learning_rate": 3.9717560155980595e-06, "loss": 0.75520724, "num_input_tokens_seen": 14428805, "step": 682, "time_per_iteration": 2.5645196437835693 }, { "auxiliary_loss_clip": 0.01240543, "auxiliary_loss_mlp": 0.01046081, "balance_loss_clip": 1.06895685, "balance_loss_mlp": 1.03513122, "epoch": 0.08212589430649912, "flos": 20594662312320.0, "grad_norm": 6.972362921006647, "language_loss": 0.91931212, "learning_rate": 3.971625415493542e-06, "loss": 0.94217837, "num_input_tokens_seen": 14447125, "step": 683, "time_per_iteration": 2.520059585571289 }, { "auxiliary_loss_clip": 0.01203308, "auxiliary_loss_mlp": 0.0104202, "balance_loss_clip": 1.06274617, "balance_loss_mlp": 1.03062391, "epoch": 0.08224613719713822, "flos": 25953611086080.0, "grad_norm": 1.920820538584633, "language_loss": 0.87649381, "learning_rate": 3.971494516295017e-06, "loss": 0.89894712, "num_input_tokens_seen": 14466575, "step": 684, "time_per_iteration": 2.6189119815826416 }, { "auxiliary_loss_clip": 0.01214251, "auxiliary_loss_mlp": 0.01036352, "balance_loss_clip": 1.06645513, "balance_loss_mlp": 1.02564704, "epoch": 0.08236638008777732, "flos": 23768734510080.0, "grad_norm": 1.9946330082411483, "language_loss": 0.85407424, "learning_rate": 3.971363318022341e-06, "loss": 0.87658024, "num_input_tokens_seen": 14487915, "step": 685, "time_per_iteration": 2.6549503803253174 }, { "auxiliary_loss_clip": 0.01227666, "auxiliary_loss_mlp": 0.01042077, "balance_loss_clip": 1.06573975, "balance_loss_mlp": 1.03070426, "epoch": 0.0824866229784164, "flos": 38799144887040.0, "grad_norm": 1.7906261735343931, "language_loss": 0.68415052, "learning_rate": 3.971231820695417e-06, "loss": 0.70684803, "num_input_tokens_seen": 14511530, "step": 686, "time_per_iteration": 2.7292287349700928 }, { "auxiliary_loss_clip": 0.01231051, "auxiliary_loss_mlp": 0.01042396, "balance_loss_clip": 1.07005107, "balance_loss_mlp": 1.03173852, "epoch": 0.0826068658690555, "flos": 23107762391040.0, "grad_norm": 2.025721019445671, "language_loss": 0.81261134, "learning_rate": 3.971100024334193e-06, "loss": 0.83534575, "num_input_tokens_seen": 14529050, "step": 687, "time_per_iteration": 2.5721166133880615 }, { "auxiliary_loss_clip": 0.01194395, "auxiliary_loss_mlp": 0.01042696, "balance_loss_clip": 1.06104183, "balance_loss_mlp": 1.03173518, "epoch": 0.08272710875969458, "flos": 21136374299520.0, "grad_norm": 2.0201295455575488, "language_loss": 0.86249757, "learning_rate": 3.970967928958663e-06, "loss": 0.8848685, "num_input_tokens_seen": 14546165, "step": 688, "time_per_iteration": 2.566673517227173 }, { "auxiliary_loss_clip": 0.01204227, "auxiliary_loss_mlp": 0.01052386, "balance_loss_clip": 1.06762934, "balance_loss_mlp": 1.04137075, "epoch": 0.08284735165033368, "flos": 19063000517760.0, "grad_norm": 1.5972583763447181, "language_loss": 0.83240515, "learning_rate": 3.970835534588865e-06, "loss": 0.85497129, "num_input_tokens_seen": 14563660, "step": 689, "time_per_iteration": 2.6124744415283203 }, { "auxiliary_loss_clip": 0.01233614, "auxiliary_loss_mlp": 0.01058963, "balance_loss_clip": 1.07595003, "balance_loss_mlp": 1.04805541, "epoch": 0.08296759454097276, "flos": 16727442387840.0, "grad_norm": 1.8772492461618835, "language_loss": 0.85627699, "learning_rate": 3.970702841244883e-06, "loss": 0.87920272, "num_input_tokens_seen": 14581980, "step": 690, "time_per_iteration": 2.537780523300171 }, { "auxiliary_loss_clip": 0.01247349, "auxiliary_loss_mlp": 0.01049108, "balance_loss_clip": 1.07395029, "balance_loss_mlp": 1.03638256, "epoch": 0.08308783743161186, "flos": 18004928567040.0, "grad_norm": 1.734435621262683, "language_loss": 0.82381076, "learning_rate": 3.970569848946847e-06, "loss": 0.84677535, "num_input_tokens_seen": 14601795, "step": 691, "time_per_iteration": 2.5620083808898926 }, { "auxiliary_loss_clip": 0.01225842, "auxiliary_loss_mlp": 0.01042153, "balance_loss_clip": 1.06828701, "balance_loss_mlp": 1.03104305, "epoch": 0.08320808032225095, "flos": 15079788599040.0, "grad_norm": 5.293191812118151, "language_loss": 0.82546532, "learning_rate": 3.970436557714932e-06, "loss": 0.84814525, "num_input_tokens_seen": 14618315, "step": 692, "time_per_iteration": 2.4818572998046875 }, { "auxiliary_loss_clip": 0.01223005, "auxiliary_loss_mlp": 0.01043426, "balance_loss_clip": 1.07022727, "balance_loss_mlp": 1.0313735, "epoch": 0.08332832321289003, "flos": 22383085501440.0, "grad_norm": 2.0163193963883908, "language_loss": 0.86339194, "learning_rate": 3.970302967569358e-06, "loss": 0.88605624, "num_input_tokens_seen": 14636905, "step": 693, "time_per_iteration": 2.584277868270874 }, { "auxiliary_loss_clip": 0.01243672, "auxiliary_loss_mlp": 0.01041273, "balance_loss_clip": 1.07406616, "balance_loss_mlp": 1.03042507, "epoch": 0.08344856610352913, "flos": 24717386655360.0, "grad_norm": 2.305276933170146, "language_loss": 0.68179417, "learning_rate": 3.9701690785303896e-06, "loss": 0.70464361, "num_input_tokens_seen": 14656100, "step": 694, "time_per_iteration": 3.3895256519317627 }, { "auxiliary_loss_clip": 0.01244017, "auxiliary_loss_mlp": 0.01038234, "balance_loss_clip": 1.06888223, "balance_loss_mlp": 1.02693868, "epoch": 0.08356880899416821, "flos": 25370206387200.0, "grad_norm": 2.1474723815963217, "language_loss": 0.8856231, "learning_rate": 3.970034890618339e-06, "loss": 0.90844566, "num_input_tokens_seen": 14675790, "step": 695, "time_per_iteration": 2.5535168647766113 }, { "auxiliary_loss_clip": 0.01224507, "auxiliary_loss_mlp": 0.01042333, "balance_loss_clip": 1.06702662, "balance_loss_mlp": 1.0322237, "epoch": 0.08368905188480731, "flos": 24353072962560.0, "grad_norm": 22.914590350622273, "language_loss": 0.87910128, "learning_rate": 3.969900403853562e-06, "loss": 0.90176964, "num_input_tokens_seen": 14694830, "step": 696, "time_per_iteration": 3.327944755554199 }, { "auxiliary_loss_clip": 0.01264535, "auxiliary_loss_mlp": 0.01045868, "balance_loss_clip": 1.07964706, "balance_loss_mlp": 1.0348587, "epoch": 0.08380929477544641, "flos": 18037319656320.0, "grad_norm": 1.582657197330352, "language_loss": 0.77917379, "learning_rate": 3.96976561825646e-06, "loss": 0.8022778, "num_input_tokens_seen": 14711920, "step": 697, "time_per_iteration": 3.2368052005767822 }, { "auxiliary_loss_clip": 0.01196744, "auxiliary_loss_mlp": 0.01044505, "balance_loss_clip": 1.06494689, "balance_loss_mlp": 1.03292966, "epoch": 0.08392953766608549, "flos": 26286287875200.0, "grad_norm": 3.8433797460103665, "language_loss": 0.86889958, "learning_rate": 3.969630533847479e-06, "loss": 0.89131206, "num_input_tokens_seen": 14730880, "step": 698, "time_per_iteration": 2.660919666290283 }, { "auxiliary_loss_clip": 0.01240881, "auxiliary_loss_mlp": 0.01047536, "balance_loss_clip": 1.06835032, "balance_loss_mlp": 1.03693259, "epoch": 0.08404978055672459, "flos": 22492146170880.0, "grad_norm": 3.5340369429234304, "language_loss": 0.84018809, "learning_rate": 3.969495150647113e-06, "loss": 0.86307216, "num_input_tokens_seen": 14749050, "step": 699, "time_per_iteration": 2.5112462043762207 }, { "auxiliary_loss_clip": 0.01207785, "auxiliary_loss_mlp": 0.01046834, "balance_loss_clip": 1.06958497, "balance_loss_mlp": 1.03584874, "epoch": 0.08417002344736367, "flos": 24826878288000.0, "grad_norm": 1.746620662816309, "language_loss": 0.76487982, "learning_rate": 3.969359468675899e-06, "loss": 0.78742599, "num_input_tokens_seen": 14769180, "step": 700, "time_per_iteration": 3.420692205429077 }, { "auxiliary_loss_clip": 0.01240851, "auxiliary_loss_mlp": 0.01037719, "balance_loss_clip": 1.07236218, "balance_loss_mlp": 1.02741981, "epoch": 0.08429026633800277, "flos": 16945922862720.0, "grad_norm": 1.9754281474535698, "language_loss": 0.89508402, "learning_rate": 3.969223487954418e-06, "loss": 0.91786969, "num_input_tokens_seen": 14786640, "step": 701, "time_per_iteration": 2.515623092651367 }, { "auxiliary_loss_clip": 0.01196511, "auxiliary_loss_mlp": 0.0104933, "balance_loss_clip": 1.06763792, "balance_loss_mlp": 1.0386076, "epoch": 0.08441050922864185, "flos": 23841920471040.0, "grad_norm": 2.5637877075434625, "language_loss": 0.82410628, "learning_rate": 3.969087208503301e-06, "loss": 0.84656465, "num_input_tokens_seen": 14806720, "step": 702, "time_per_iteration": 2.6498188972473145 }, { "auxiliary_loss_clip": 0.01190836, "auxiliary_loss_mlp": 0.01044117, "balance_loss_clip": 1.06499684, "balance_loss_mlp": 1.03214216, "epoch": 0.08453075211928095, "flos": 25520205582720.0, "grad_norm": 2.0917302696645246, "language_loss": 0.84440154, "learning_rate": 3.968950630343219e-06, "loss": 0.86675107, "num_input_tokens_seen": 14823705, "step": 703, "time_per_iteration": 2.625582456588745 }, { "auxiliary_loss_clip": 0.01222835, "auxiliary_loss_mlp": 0.01047265, "balance_loss_clip": 1.06576729, "balance_loss_mlp": 1.03551149, "epoch": 0.08465099500992004, "flos": 19532496211200.0, "grad_norm": 2.2477520888216778, "language_loss": 0.93543708, "learning_rate": 3.968813753494892e-06, "loss": 0.95813811, "num_input_tokens_seen": 14841865, "step": 704, "time_per_iteration": 2.5313544273376465 }, { "auxiliary_loss_clip": 0.01199034, "auxiliary_loss_mlp": 0.00764444, "balance_loss_clip": 1.06338692, "balance_loss_mlp": 0.99998397, "epoch": 0.08477123790055913, "flos": 29351299403520.0, "grad_norm": 2.0996590777265762, "language_loss": 0.75297862, "learning_rate": 3.968676577979084e-06, "loss": 0.77261341, "num_input_tokens_seen": 14861415, "step": 705, "time_per_iteration": 2.6507866382598877 }, { "auxiliary_loss_clip": 0.0119077, "auxiliary_loss_mlp": 0.01043264, "balance_loss_clip": 1.06155872, "balance_loss_mlp": 1.03196859, "epoch": 0.08489148079119822, "flos": 18624495283200.0, "grad_norm": 2.0596626757152294, "language_loss": 0.78309339, "learning_rate": 3.968539103816605e-06, "loss": 0.80543375, "num_input_tokens_seen": 14879215, "step": 706, "time_per_iteration": 2.5788023471832275 }, { "auxiliary_loss_clip": 0.01221809, "auxiliary_loss_mlp": 0.007646, "balance_loss_clip": 1.06720722, "balance_loss_mlp": 1.00001335, "epoch": 0.0850117236818373, "flos": 23471393725440.0, "grad_norm": 1.971108933895679, "language_loss": 0.8932091, "learning_rate": 3.9684013310283085e-06, "loss": 0.91307318, "num_input_tokens_seen": 14897900, "step": 707, "time_per_iteration": 2.6142454147338867 }, { "auxiliary_loss_clip": 0.01227841, "auxiliary_loss_mlp": 0.0105383, "balance_loss_clip": 1.07463467, "balance_loss_mlp": 1.04274988, "epoch": 0.0851319665724764, "flos": 40625058896640.0, "grad_norm": 1.922811636898232, "language_loss": 0.63755035, "learning_rate": 3.9682632596350956e-06, "loss": 0.66036713, "num_input_tokens_seen": 14919065, "step": 708, "time_per_iteration": 2.7088584899902344 }, { "auxiliary_loss_clip": 0.01239393, "auxiliary_loss_mlp": 0.01035399, "balance_loss_clip": 1.0713799, "balance_loss_mlp": 1.02483714, "epoch": 0.0852522094631155, "flos": 15879554870400.0, "grad_norm": 1.9172139736044205, "language_loss": 0.78341269, "learning_rate": 3.968124889657911e-06, "loss": 0.80616057, "num_input_tokens_seen": 14934165, "step": 709, "time_per_iteration": 2.4899802207946777 }, { "auxiliary_loss_clip": 0.01194215, "auxiliary_loss_mlp": 0.01047943, "balance_loss_clip": 1.06302011, "balance_loss_mlp": 1.03677261, "epoch": 0.08537245235375458, "flos": 14567091822720.0, "grad_norm": 2.2804779315666397, "language_loss": 0.90672714, "learning_rate": 3.967986221117746e-06, "loss": 0.92914867, "num_input_tokens_seen": 14950105, "step": 710, "time_per_iteration": 2.5875139236450195 }, { "auxiliary_loss_clip": 0.01171299, "auxiliary_loss_mlp": 0.0104113, "balance_loss_clip": 1.06226516, "balance_loss_mlp": 1.02911925, "epoch": 0.08549269524439368, "flos": 26468929555200.0, "grad_norm": 2.1460700894606624, "language_loss": 0.86490011, "learning_rate": 3.967847254035635e-06, "loss": 0.8870244, "num_input_tokens_seen": 14969490, "step": 711, "time_per_iteration": 2.7098920345306396 }, { "auxiliary_loss_clip": 0.01216307, "auxiliary_loss_mlp": 0.01046195, "balance_loss_clip": 1.06927776, "balance_loss_mlp": 1.03444076, "epoch": 0.08561293813503276, "flos": 13590214565760.0, "grad_norm": 2.628019355105924, "language_loss": 0.86344844, "learning_rate": 3.967707988432661e-06, "loss": 0.88607347, "num_input_tokens_seen": 14987195, "step": 712, "time_per_iteration": 2.5610897541046143 }, { "auxiliary_loss_clip": 0.01253144, "auxiliary_loss_mlp": 0.01047841, "balance_loss_clip": 1.06913018, "balance_loss_mlp": 1.03657532, "epoch": 0.08573318102567186, "flos": 26943524979840.0, "grad_norm": 4.448172811454269, "language_loss": 0.87873793, "learning_rate": 3.967568424329949e-06, "loss": 0.90174782, "num_input_tokens_seen": 15007620, "step": 713, "time_per_iteration": 2.5360701084136963 }, { "auxiliary_loss_clip": 0.01136642, "auxiliary_loss_mlp": 0.01015249, "balance_loss_clip": 1.04263878, "balance_loss_mlp": 1.01143456, "epoch": 0.08585342391631094, "flos": 67302739319040.0, "grad_norm": 0.8233994813524171, "language_loss": 0.55554038, "learning_rate": 3.967428561748671e-06, "loss": 0.57705933, "num_input_tokens_seen": 15075590, "step": 714, "time_per_iteration": 3.268935203552246 }, { "auxiliary_loss_clip": 0.0118782, "auxiliary_loss_mlp": 0.01047991, "balance_loss_clip": 1.06094038, "balance_loss_mlp": 1.03619576, "epoch": 0.08597366680695004, "flos": 22456594684800.0, "grad_norm": 1.9155834690868487, "language_loss": 0.8773253, "learning_rate": 3.967288400710045e-06, "loss": 0.89968348, "num_input_tokens_seen": 15095055, "step": 715, "time_per_iteration": 2.6126434803009033 }, { "auxiliary_loss_clip": 0.01211115, "auxiliary_loss_mlp": 0.01046168, "balance_loss_clip": 1.07375634, "balance_loss_mlp": 1.03530192, "epoch": 0.08609390969758914, "flos": 23550505430400.0, "grad_norm": 1.9398796645487912, "language_loss": 0.88494527, "learning_rate": 3.9671479412353335e-06, "loss": 0.90751803, "num_input_tokens_seen": 15113520, "step": 716, "time_per_iteration": 2.596857786178589 }, { "auxiliary_loss_clip": 0.01241492, "auxiliary_loss_mlp": 0.01046501, "balance_loss_clip": 1.07251263, "balance_loss_mlp": 1.03509295, "epoch": 0.08621415258822822, "flos": 25885848078720.0, "grad_norm": 2.117115523251957, "language_loss": 0.73955894, "learning_rate": 3.967007183345843e-06, "loss": 0.76243889, "num_input_tokens_seen": 15133375, "step": 717, "time_per_iteration": 2.5691041946411133 }, { "auxiliary_loss_clip": 0.01236991, "auxiliary_loss_mlp": 0.01042304, "balance_loss_clip": 1.07244885, "balance_loss_mlp": 1.03187311, "epoch": 0.08633439547886732, "flos": 13589568120960.0, "grad_norm": 2.171576951205799, "language_loss": 0.89603287, "learning_rate": 3.966866127062927e-06, "loss": 0.91882581, "num_input_tokens_seen": 15150500, "step": 718, "time_per_iteration": 2.4913113117218018 }, { "auxiliary_loss_clip": 0.01132519, "auxiliary_loss_mlp": 0.01004081, "balance_loss_clip": 1.03746223, "balance_loss_mlp": 1.00040913, "epoch": 0.0864546383695064, "flos": 57767342434560.0, "grad_norm": 0.8698767782351514, "language_loss": 0.62731743, "learning_rate": 3.966724772407982e-06, "loss": 0.64868343, "num_input_tokens_seen": 15208015, "step": 719, "time_per_iteration": 2.9686620235443115 }, { "auxiliary_loss_clip": 0.01202048, "auxiliary_loss_mlp": 0.01045233, "balance_loss_clip": 1.06728303, "balance_loss_mlp": 1.03455222, "epoch": 0.0865748812601455, "flos": 20046952753920.0, "grad_norm": 2.4279615561576278, "language_loss": 0.8858425, "learning_rate": 3.966583119402454e-06, "loss": 0.90831536, "num_input_tokens_seen": 15224780, "step": 720, "time_per_iteration": 3.4400806427001953 }, { "auxiliary_loss_clip": 0.0123849, "auxiliary_loss_mlp": 0.00764466, "balance_loss_clip": 1.07219291, "balance_loss_mlp": 1.00001526, "epoch": 0.08669512415078459, "flos": 35262446935680.0, "grad_norm": 1.7942031630823065, "language_loss": 0.82150275, "learning_rate": 3.9664411680678305e-06, "loss": 0.84153223, "num_input_tokens_seen": 15246535, "step": 721, "time_per_iteration": 2.632458448410034 }, { "auxiliary_loss_clip": 0.01110107, "auxiliary_loss_mlp": 0.01004941, "balance_loss_clip": 1.03540564, "balance_loss_mlp": 1.00134075, "epoch": 0.08681536704142367, "flos": 65654870048640.0, "grad_norm": 0.8409224373736027, "language_loss": 0.61443877, "learning_rate": 3.966298918425644e-06, "loss": 0.63558924, "num_input_tokens_seen": 15304025, "step": 722, "time_per_iteration": 4.607870578765869 }, { "auxiliary_loss_clip": 0.01242981, "auxiliary_loss_mlp": 0.01048659, "balance_loss_clip": 1.06984723, "balance_loss_mlp": 1.03755474, "epoch": 0.08693560993206277, "flos": 34529940881280.0, "grad_norm": 2.315905477403946, "language_loss": 0.82935256, "learning_rate": 3.966156370497476e-06, "loss": 0.85226893, "num_input_tokens_seen": 15327635, "step": 723, "time_per_iteration": 2.64089298248291 }, { "auxiliary_loss_clip": 0.01245954, "auxiliary_loss_mlp": 0.01056159, "balance_loss_clip": 1.07196927, "balance_loss_mlp": 1.04522777, "epoch": 0.08705585282270185, "flos": 23149419189120.0, "grad_norm": 1.9754117695694542, "language_loss": 0.8817541, "learning_rate": 3.96601352430495e-06, "loss": 0.90477526, "num_input_tokens_seen": 15347405, "step": 724, "time_per_iteration": 2.530644178390503 }, { "auxiliary_loss_clip": 0.01226445, "auxiliary_loss_mlp": 0.01053512, "balance_loss_clip": 1.07198882, "balance_loss_mlp": 1.04270625, "epoch": 0.08717609571334095, "flos": 29497599498240.0, "grad_norm": 1.5711549722037426, "language_loss": 0.83087593, "learning_rate": 3.965870379869735e-06, "loss": 0.85367548, "num_input_tokens_seen": 15369450, "step": 725, "time_per_iteration": 3.46901798248291 }, { "auxiliary_loss_clip": 0.01241289, "auxiliary_loss_mlp": 0.01041565, "balance_loss_clip": 1.0689038, "balance_loss_mlp": 1.0302937, "epoch": 0.08729633860398003, "flos": 20667489137280.0, "grad_norm": 2.114664527946292, "language_loss": 0.86905932, "learning_rate": 3.965726937213547e-06, "loss": 0.8918879, "num_input_tokens_seen": 15388085, "step": 726, "time_per_iteration": 2.5195484161376953 }, { "auxiliary_loss_clip": 0.01237242, "auxiliary_loss_mlp": 0.01050852, "balance_loss_clip": 1.06628406, "balance_loss_mlp": 1.03949761, "epoch": 0.08741658149461913, "flos": 18369493655040.0, "grad_norm": 3.948081079557401, "language_loss": 0.81172085, "learning_rate": 3.965583196358144e-06, "loss": 0.83460176, "num_input_tokens_seen": 15407120, "step": 727, "time_per_iteration": 2.514963150024414 }, { "auxiliary_loss_clip": 0.01257923, "auxiliary_loss_mlp": 0.01046486, "balance_loss_clip": 1.07250571, "balance_loss_mlp": 1.03514314, "epoch": 0.08753682438525823, "flos": 18729677283840.0, "grad_norm": 2.281121079126419, "language_loss": 0.74573541, "learning_rate": 3.965439157325335e-06, "loss": 0.76877952, "num_input_tokens_seen": 15424485, "step": 728, "time_per_iteration": 2.45369553565979 }, { "auxiliary_loss_clip": 0.01216035, "auxiliary_loss_mlp": 0.01042981, "balance_loss_clip": 1.06279504, "balance_loss_mlp": 1.03151286, "epoch": 0.08765706727589731, "flos": 27776113303680.0, "grad_norm": 1.8219668586111268, "language_loss": 0.75805223, "learning_rate": 3.965294820136968e-06, "loss": 0.78064233, "num_input_tokens_seen": 15446285, "step": 729, "time_per_iteration": 2.6602978706359863 }, { "auxiliary_loss_clip": 0.01222097, "auxiliary_loss_mlp": 0.01054354, "balance_loss_clip": 1.07008469, "balance_loss_mlp": 1.04184937, "epoch": 0.08777731016653641, "flos": 24389127239040.0, "grad_norm": 1.9740549905113545, "language_loss": 0.87062752, "learning_rate": 3.965150184814938e-06, "loss": 0.89339203, "num_input_tokens_seen": 15465770, "step": 730, "time_per_iteration": 2.5938923358917236 }, { "auxiliary_loss_clip": 0.01213564, "auxiliary_loss_mlp": 0.01045629, "balance_loss_clip": 1.06544578, "balance_loss_mlp": 1.03388715, "epoch": 0.08789755305717549, "flos": 21981855605760.0, "grad_norm": 2.09990304459907, "language_loss": 0.76963872, "learning_rate": 3.965005251381189e-06, "loss": 0.79223061, "num_input_tokens_seen": 15483705, "step": 731, "time_per_iteration": 2.561878204345703 }, { "auxiliary_loss_clip": 0.01129354, "auxiliary_loss_mlp": 0.01005951, "balance_loss_clip": 1.02994061, "balance_loss_mlp": 1.00227916, "epoch": 0.08801779594781459, "flos": 58360120583040.0, "grad_norm": 0.8893821239611491, "language_loss": 0.64625722, "learning_rate": 3.964860019857705e-06, "loss": 0.66761029, "num_input_tokens_seen": 15548620, "step": 732, "time_per_iteration": 3.119004964828491 }, { "auxiliary_loss_clip": 0.01255765, "auxiliary_loss_mlp": 0.01041336, "balance_loss_clip": 1.07386506, "balance_loss_mlp": 1.0305295, "epoch": 0.08813803883845367, "flos": 23294785530240.0, "grad_norm": 1.6981561883948681, "language_loss": 0.84292877, "learning_rate": 3.964714490266518e-06, "loss": 0.8658998, "num_input_tokens_seen": 15569265, "step": 733, "time_per_iteration": 2.493178606033325 }, { "auxiliary_loss_clip": 0.01122995, "auxiliary_loss_mlp": 0.01005682, "balance_loss_clip": 1.02938759, "balance_loss_mlp": 1.0020107, "epoch": 0.08825828172909277, "flos": 63424924882560.0, "grad_norm": 0.8877097745259482, "language_loss": 0.64589357, "learning_rate": 3.964568662629706e-06, "loss": 0.6671803, "num_input_tokens_seen": 15630570, "step": 734, "time_per_iteration": 3.017296314239502 }, { "auxiliary_loss_clip": 0.01233277, "auxiliary_loss_mlp": 0.0103969, "balance_loss_clip": 1.06586373, "balance_loss_mlp": 1.02829921, "epoch": 0.08837852461973186, "flos": 26720986268160.0, "grad_norm": 2.0811808846829742, "language_loss": 0.84133303, "learning_rate": 3.9644225369693895e-06, "loss": 0.86406273, "num_input_tokens_seen": 15650870, "step": 735, "time_per_iteration": 2.5731072425842285 }, { "auxiliary_loss_clip": 0.01255918, "auxiliary_loss_mlp": 0.01049824, "balance_loss_clip": 1.07482886, "balance_loss_mlp": 1.03842151, "epoch": 0.08849876751037095, "flos": 27265427688960.0, "grad_norm": 1.9834170849661659, "language_loss": 0.86897874, "learning_rate": 3.964276113307735e-06, "loss": 0.89203614, "num_input_tokens_seen": 15670835, "step": 736, "time_per_iteration": 2.5332117080688477 }, { "auxiliary_loss_clip": 0.01207955, "auxiliary_loss_mlp": 0.01047424, "balance_loss_clip": 1.06933761, "balance_loss_mlp": 1.03580117, "epoch": 0.08861901040101004, "flos": 19828759587840.0, "grad_norm": 1.880779118197899, "language_loss": 0.80800509, "learning_rate": 3.9641293916669574e-06, "loss": 0.8305589, "num_input_tokens_seen": 15689795, "step": 737, "time_per_iteration": 2.582185983657837 }, { "auxiliary_loss_clip": 0.01203548, "auxiliary_loss_mlp": 0.01038206, "balance_loss_clip": 1.06590128, "balance_loss_mlp": 1.02607083, "epoch": 0.08873925329164913, "flos": 23658704173440.0, "grad_norm": 2.131178367497206, "language_loss": 0.82845724, "learning_rate": 3.9639823720693115e-06, "loss": 0.85087484, "num_input_tokens_seen": 15711650, "step": 738, "time_per_iteration": 2.6417157649993896 }, { "auxiliary_loss_clip": 0.01105564, "auxiliary_loss_mlp": 0.0101166, "balance_loss_clip": 1.03725469, "balance_loss_mlp": 1.00827408, "epoch": 0.08885949618228822, "flos": 71831541893760.0, "grad_norm": 0.8351373750601038, "language_loss": 0.60022497, "learning_rate": 3.963835054537102e-06, "loss": 0.6213972, "num_input_tokens_seen": 15780615, "step": 739, "time_per_iteration": 3.235017776489258 }, { "auxiliary_loss_clip": 0.01219474, "auxiliary_loss_mlp": 0.01049937, "balance_loss_clip": 1.06283939, "balance_loss_mlp": 1.03970885, "epoch": 0.08897973907292732, "flos": 22346169298560.0, "grad_norm": 2.740952902750381, "language_loss": 0.60750973, "learning_rate": 3.963687439092676e-06, "loss": 0.63020384, "num_input_tokens_seen": 15801300, "step": 740, "time_per_iteration": 2.574176073074341 }, { "auxiliary_loss_clip": 0.01237672, "auxiliary_loss_mlp": 0.01038952, "balance_loss_clip": 1.07051992, "balance_loss_mlp": 1.02819383, "epoch": 0.0890999819635664, "flos": 21251827589760.0, "grad_norm": 2.00325796831368, "language_loss": 0.80350685, "learning_rate": 3.963539525758427e-06, "loss": 0.82627308, "num_input_tokens_seen": 15820860, "step": 741, "time_per_iteration": 2.535719871520996 }, { "auxiliary_loss_clip": 0.01226153, "auxiliary_loss_mlp": 0.01042005, "balance_loss_clip": 1.07086885, "balance_loss_mlp": 1.03063226, "epoch": 0.0892202248542055, "flos": 25370888745600.0, "grad_norm": 2.9215691528961774, "language_loss": 0.67409343, "learning_rate": 3.9633913145567925e-06, "loss": 0.69677496, "num_input_tokens_seen": 15841350, "step": 742, "time_per_iteration": 2.5982158184051514 }, { "auxiliary_loss_clip": 0.01222857, "auxiliary_loss_mlp": 0.01040881, "balance_loss_clip": 1.06855822, "balance_loss_mlp": 1.03054583, "epoch": 0.08934046774484458, "flos": 24457895827200.0, "grad_norm": 1.9140463338723634, "language_loss": 0.81376088, "learning_rate": 3.9632428055102575e-06, "loss": 0.83639824, "num_input_tokens_seen": 15861360, "step": 743, "time_per_iteration": 2.5966713428497314 }, { "auxiliary_loss_clip": 0.01244286, "auxiliary_loss_mlp": 0.0104206, "balance_loss_clip": 1.0729003, "balance_loss_mlp": 1.03047299, "epoch": 0.08946071063548368, "flos": 35772773414400.0, "grad_norm": 2.2728602138404734, "language_loss": 0.66841555, "learning_rate": 3.9630939986413495e-06, "loss": 0.69127905, "num_input_tokens_seen": 15883160, "step": 744, "time_per_iteration": 2.643235683441162 }, { "auxiliary_loss_clip": 0.0119124, "auxiliary_loss_mlp": 0.01047881, "balance_loss_clip": 1.06294763, "balance_loss_mlp": 1.03630567, "epoch": 0.08958095352612276, "flos": 14356584167040.0, "grad_norm": 1.7837066340580356, "language_loss": 0.78000546, "learning_rate": 3.962944893972643e-06, "loss": 0.80239666, "num_input_tokens_seen": 15901610, "step": 745, "time_per_iteration": 2.5688962936401367 }, { "auxiliary_loss_clip": 0.01227588, "auxiliary_loss_mlp": 0.01050677, "balance_loss_clip": 1.07382917, "balance_loss_mlp": 1.03951287, "epoch": 0.08970119641676186, "flos": 17853277345920.0, "grad_norm": 2.383179000791531, "language_loss": 0.91139805, "learning_rate": 3.962795491526756e-06, "loss": 0.93418074, "num_input_tokens_seen": 15918770, "step": 746, "time_per_iteration": 3.3623225688934326 }, { "auxiliary_loss_clip": 0.01260516, "auxiliary_loss_mlp": 0.01041249, "balance_loss_clip": 1.07445717, "balance_loss_mlp": 1.03057432, "epoch": 0.08982143930740095, "flos": 20811670329600.0, "grad_norm": 2.404111055068438, "language_loss": 0.89365947, "learning_rate": 3.962645791326354e-06, "loss": 0.91667712, "num_input_tokens_seen": 15938025, "step": 747, "time_per_iteration": 2.5380802154541016 }, { "auxiliary_loss_clip": 0.01237713, "auxiliary_loss_mlp": 0.0104221, "balance_loss_clip": 1.07236433, "balance_loss_mlp": 1.03179085, "epoch": 0.08994168219804004, "flos": 24097712198400.0, "grad_norm": 2.6051877897296913, "language_loss": 0.83633399, "learning_rate": 3.962495793394146e-06, "loss": 0.85913324, "num_input_tokens_seen": 15957215, "step": 748, "time_per_iteration": 3.239537477493286 }, { "auxiliary_loss_clip": 0.01133871, "auxiliary_loss_mlp": 0.01007408, "balance_loss_clip": 1.02692497, "balance_loss_mlp": 1.00368905, "epoch": 0.09006192508867913, "flos": 57188893812480.0, "grad_norm": 0.7420300803272664, "language_loss": 0.61215812, "learning_rate": 3.9623454977528864e-06, "loss": 0.63357091, "num_input_tokens_seen": 16015870, "step": 749, "time_per_iteration": 3.6701695919036865 }, { "auxiliary_loss_clip": 0.01212802, "auxiliary_loss_mlp": 0.01045977, "balance_loss_clip": 1.06826282, "balance_loss_mlp": 1.03382349, "epoch": 0.09018216797931822, "flos": 20487505063680.0, "grad_norm": 1.970777033342424, "language_loss": 0.85174465, "learning_rate": 3.962194904425375e-06, "loss": 0.87433243, "num_input_tokens_seen": 16036500, "step": 750, "time_per_iteration": 2.6185920238494873 }, { "auxiliary_loss_clip": 0.01236023, "auxiliary_loss_mlp": 0.01041354, "balance_loss_clip": 1.07013035, "balance_loss_mlp": 1.02967775, "epoch": 0.09030241086995731, "flos": 22638123043200.0, "grad_norm": 1.8780348303788152, "language_loss": 0.67734766, "learning_rate": 3.9620440134344566e-06, "loss": 0.7001214, "num_input_tokens_seen": 16054655, "step": 751, "time_per_iteration": 3.323148250579834 }, { "auxiliary_loss_clip": 0.01207105, "auxiliary_loss_mlp": 0.01039729, "balance_loss_clip": 1.06600189, "balance_loss_mlp": 1.02753353, "epoch": 0.09042265376059641, "flos": 21871502046720.0, "grad_norm": 2.0852204154834713, "language_loss": 0.82407027, "learning_rate": 3.9618928248030215e-06, "loss": 0.84653854, "num_input_tokens_seen": 16074165, "step": 752, "time_per_iteration": 2.5707337856292725 }, { "auxiliary_loss_clip": 0.01236855, "auxiliary_loss_mlp": 0.01049881, "balance_loss_clip": 1.07000899, "balance_loss_mlp": 1.03864515, "epoch": 0.0905428966512355, "flos": 24316192673280.0, "grad_norm": 2.5077444509283473, "language_loss": 0.82879514, "learning_rate": 3.961741338554005e-06, "loss": 0.85166252, "num_input_tokens_seen": 16092505, "step": 753, "time_per_iteration": 2.522052526473999 }, { "auxiliary_loss_clip": 0.01229213, "auxiliary_loss_mlp": 0.01050286, "balance_loss_clip": 1.07034528, "balance_loss_mlp": 1.03882408, "epoch": 0.09066313954187459, "flos": 35845061535360.0, "grad_norm": 2.929239733972447, "language_loss": 0.75438964, "learning_rate": 3.9615895547103865e-06, "loss": 0.77718472, "num_input_tokens_seen": 16116150, "step": 754, "time_per_iteration": 2.6564793586730957 }, { "auxiliary_loss_clip": 0.01224218, "auxiliary_loss_mlp": 0.01041675, "balance_loss_clip": 1.07002878, "balance_loss_mlp": 1.03126192, "epoch": 0.09078338243251367, "flos": 29168729550720.0, "grad_norm": 1.864324490066283, "language_loss": 0.7791456, "learning_rate": 3.961437473295193e-06, "loss": 0.80180454, "num_input_tokens_seen": 16136295, "step": 755, "time_per_iteration": 2.601822853088379 }, { "auxiliary_loss_clip": 0.01179527, "auxiliary_loss_mlp": 0.01044438, "balance_loss_clip": 1.05827498, "balance_loss_mlp": 1.03314924, "epoch": 0.09090362532315277, "flos": 21907699977600.0, "grad_norm": 3.308524560375813, "language_loss": 0.72648042, "learning_rate": 3.961285094331495e-06, "loss": 0.74872005, "num_input_tokens_seen": 16154210, "step": 756, "time_per_iteration": 2.610873222351074 }, { "auxiliary_loss_clip": 0.01258728, "auxiliary_loss_mlp": 0.01044727, "balance_loss_clip": 1.07609963, "balance_loss_mlp": 1.0341351, "epoch": 0.09102386821379185, "flos": 27344503480320.0, "grad_norm": 1.7085585643146453, "language_loss": 0.85624099, "learning_rate": 3.961132417842406e-06, "loss": 0.8792755, "num_input_tokens_seen": 16173995, "step": 757, "time_per_iteration": 2.5288617610931396 }, { "auxiliary_loss_clip": 0.01232215, "auxiliary_loss_mlp": 0.01044873, "balance_loss_clip": 1.07107306, "balance_loss_mlp": 1.03375673, "epoch": 0.09114411110443095, "flos": 20813501923200.0, "grad_norm": 2.93800398921089, "language_loss": 0.75228864, "learning_rate": 3.960979443851089e-06, "loss": 0.77505958, "num_input_tokens_seen": 16191020, "step": 758, "time_per_iteration": 2.519134283065796 }, { "auxiliary_loss_clip": 0.01217618, "auxiliary_loss_mlp": 0.01042819, "balance_loss_clip": 1.06623197, "balance_loss_mlp": 1.03153551, "epoch": 0.09126435399507005, "flos": 26145949438080.0, "grad_norm": 1.6459148665720467, "language_loss": 0.78855079, "learning_rate": 3.96082617238075e-06, "loss": 0.81115514, "num_input_tokens_seen": 16213645, "step": 759, "time_per_iteration": 2.6353423595428467 }, { "auxiliary_loss_clip": 0.01221737, "auxiliary_loss_mlp": 0.01044496, "balance_loss_clip": 1.06830132, "balance_loss_mlp": 1.03405905, "epoch": 0.09138459688570913, "flos": 24388911757440.0, "grad_norm": 10.141536389560802, "language_loss": 0.79798836, "learning_rate": 3.960672603454639e-06, "loss": 0.8206507, "num_input_tokens_seen": 16233625, "step": 760, "time_per_iteration": 2.5745768547058105 }, { "auxiliary_loss_clip": 0.01232443, "auxiliary_loss_mlp": 0.01040755, "balance_loss_clip": 1.06903803, "balance_loss_mlp": 1.03016305, "epoch": 0.09150483977634823, "flos": 21032664756480.0, "grad_norm": 3.2292339249235185, "language_loss": 0.77025688, "learning_rate": 3.960518737096054e-06, "loss": 0.7929889, "num_input_tokens_seen": 16253255, "step": 761, "time_per_iteration": 2.510680913925171 }, { "auxiliary_loss_clip": 0.01240402, "auxiliary_loss_mlp": 0.01039463, "balance_loss_clip": 1.07233405, "balance_loss_mlp": 1.0286628, "epoch": 0.09162508266698731, "flos": 22856998567680.0, "grad_norm": 2.905919305148167, "language_loss": 0.7304734, "learning_rate": 3.960364573328334e-06, "loss": 0.753272, "num_input_tokens_seen": 16272580, "step": 762, "time_per_iteration": 2.5412545204162598 }, { "auxiliary_loss_clip": 0.01206134, "auxiliary_loss_mlp": 0.01040318, "balance_loss_clip": 1.06518328, "balance_loss_mlp": 1.02964902, "epoch": 0.0917453255576264, "flos": 21724411852800.0, "grad_norm": 1.8615635050885389, "language_loss": 0.88563764, "learning_rate": 3.9602101121748675e-06, "loss": 0.90810215, "num_input_tokens_seen": 16293075, "step": 763, "time_per_iteration": 2.6115684509277344 }, { "auxiliary_loss_clip": 0.01220669, "auxiliary_loss_mlp": 0.01038575, "balance_loss_clip": 1.06872499, "balance_loss_mlp": 1.02853131, "epoch": 0.0918655684482655, "flos": 14609215497600.0, "grad_norm": 2.008012258462435, "language_loss": 0.72656739, "learning_rate": 3.960055353659085e-06, "loss": 0.74915987, "num_input_tokens_seen": 16310185, "step": 764, "time_per_iteration": 2.5066142082214355 }, { "auxiliary_loss_clip": 0.01210592, "auxiliary_loss_mlp": 0.0104438, "balance_loss_clip": 1.06844819, "balance_loss_mlp": 1.03379476, "epoch": 0.09198581133890459, "flos": 23435016226560.0, "grad_norm": 1.7948369977375693, "language_loss": 0.83571273, "learning_rate": 3.959900297804465e-06, "loss": 0.85826242, "num_input_tokens_seen": 16330355, "step": 765, "time_per_iteration": 2.594778060913086 }, { "auxiliary_loss_clip": 0.01205877, "auxiliary_loss_mlp": 0.01047644, "balance_loss_clip": 1.06420135, "balance_loss_mlp": 1.03689098, "epoch": 0.09210605422954368, "flos": 16795887753600.0, "grad_norm": 1.9593770955542835, "language_loss": 0.77450544, "learning_rate": 3.9597449446345276e-06, "loss": 0.79704064, "num_input_tokens_seen": 16347600, "step": 766, "time_per_iteration": 2.5255768299102783 }, { "auxiliary_loss_clip": 0.0121129, "auxiliary_loss_mlp": 0.01041075, "balance_loss_clip": 1.0655762, "balance_loss_mlp": 1.03047132, "epoch": 0.09222629712018277, "flos": 22674249146880.0, "grad_norm": 2.277951232389627, "language_loss": 0.82826042, "learning_rate": 3.95958929417284e-06, "loss": 0.85078406, "num_input_tokens_seen": 16365755, "step": 767, "time_per_iteration": 2.554725170135498 }, { "auxiliary_loss_clip": 0.0112295, "auxiliary_loss_mlp": 0.01003989, "balance_loss_clip": 1.02645481, "balance_loss_mlp": 0.99996012, "epoch": 0.09234654001082186, "flos": 69976756327680.0, "grad_norm": 0.7881218553321822, "language_loss": 0.58797956, "learning_rate": 3.9594333464430145e-06, "loss": 0.60924894, "num_input_tokens_seen": 16435245, "step": 768, "time_per_iteration": 3.2438137531280518 }, { "auxiliary_loss_clip": 0.01152644, "auxiliary_loss_mlp": 0.01036034, "balance_loss_clip": 1.0567801, "balance_loss_mlp": 1.02625251, "epoch": 0.09246678290146094, "flos": 20011437181440.0, "grad_norm": 1.8815795360444685, "language_loss": 0.87739921, "learning_rate": 3.959277101468709e-06, "loss": 0.89928591, "num_input_tokens_seen": 16454795, "step": 769, "time_per_iteration": 2.6676254272460938 }, { "auxiliary_loss_clip": 0.01204624, "auxiliary_loss_mlp": 0.01041977, "balance_loss_clip": 1.06451154, "balance_loss_mlp": 1.03114712, "epoch": 0.09258702579210004, "flos": 17747448900480.0, "grad_norm": 4.300801006051527, "language_loss": 0.78533185, "learning_rate": 3.959120559273624e-06, "loss": 0.80779791, "num_input_tokens_seen": 16472580, "step": 770, "time_per_iteration": 2.5300285816192627 }, { "auxiliary_loss_clip": 0.01205416, "auxiliary_loss_mlp": 0.01042755, "balance_loss_clip": 1.06745899, "balance_loss_mlp": 1.0328306, "epoch": 0.09270726868273914, "flos": 20886544229760.0, "grad_norm": 2.2510859508489998, "language_loss": 0.83523977, "learning_rate": 3.958963719881509e-06, "loss": 0.85772145, "num_input_tokens_seen": 16490670, "step": 771, "time_per_iteration": 2.5514917373657227 }, { "auxiliary_loss_clip": 0.01236097, "auxiliary_loss_mlp": 0.01046992, "balance_loss_clip": 1.07132244, "balance_loss_mlp": 1.03644836, "epoch": 0.09282751157337822, "flos": 17015697031680.0, "grad_norm": 1.8922393242082698, "language_loss": 0.93970394, "learning_rate": 3.958806583316154e-06, "loss": 0.96253479, "num_input_tokens_seen": 16508640, "step": 772, "time_per_iteration": 3.243525981903076 }, { "auxiliary_loss_clip": 0.01253902, "auxiliary_loss_mlp": 0.01039477, "balance_loss_clip": 1.07453108, "balance_loss_mlp": 1.02870607, "epoch": 0.09294775446401732, "flos": 32523647748480.0, "grad_norm": 1.8091859732670064, "language_loss": 0.78661937, "learning_rate": 3.9586491496013985e-06, "loss": 0.80955315, "num_input_tokens_seen": 16531035, "step": 773, "time_per_iteration": 2.571927309036255 }, { "auxiliary_loss_clip": 0.01241072, "auxiliary_loss_mlp": 0.01045262, "balance_loss_clip": 1.07316315, "balance_loss_mlp": 1.03405607, "epoch": 0.0930679973546564, "flos": 18259750627200.0, "grad_norm": 2.5128235515169535, "language_loss": 0.83176446, "learning_rate": 3.958491418761124e-06, "loss": 0.85462779, "num_input_tokens_seen": 16548605, "step": 774, "time_per_iteration": 3.197734832763672 }, { "auxiliary_loss_clip": 0.01220755, "auxiliary_loss_mlp": 0.01046075, "balance_loss_clip": 1.06334484, "balance_loss_mlp": 1.03562033, "epoch": 0.0931882402452955, "flos": 21099745405440.0, "grad_norm": 2.2136310057020534, "language_loss": 0.72889328, "learning_rate": 3.958333390819258e-06, "loss": 0.75156152, "num_input_tokens_seen": 16565535, "step": 775, "time_per_iteration": 3.279985189437866 }, { "auxiliary_loss_clip": 0.01253735, "auxiliary_loss_mlp": 0.01050483, "balance_loss_clip": 1.07426572, "balance_loss_mlp": 1.04014206, "epoch": 0.0933084831359346, "flos": 24207275658240.0, "grad_norm": 2.2704975768591003, "language_loss": 0.80397034, "learning_rate": 3.9581750657997754e-06, "loss": 0.82701248, "num_input_tokens_seen": 16584900, "step": 776, "time_per_iteration": 2.5226974487304688 }, { "auxiliary_loss_clip": 0.01217589, "auxiliary_loss_mlp": 0.0104137, "balance_loss_clip": 1.06566143, "balance_loss_mlp": 1.03093302, "epoch": 0.09342872602657368, "flos": 25480272637440.0, "grad_norm": 1.71273368053945, "language_loss": 0.89777064, "learning_rate": 3.95801644372669e-06, "loss": 0.92036021, "num_input_tokens_seen": 16604805, "step": 777, "time_per_iteration": 3.375718116760254 }, { "auxiliary_loss_clip": 0.01225818, "auxiliary_loss_mlp": 0.01036286, "balance_loss_clip": 1.06525826, "balance_loss_mlp": 1.02547991, "epoch": 0.09354896891721277, "flos": 23149060053120.0, "grad_norm": 2.1268951007902577, "language_loss": 0.84312129, "learning_rate": 3.957857524624068e-06, "loss": 0.86574233, "num_input_tokens_seen": 16623685, "step": 778, "time_per_iteration": 2.5608747005462646 }, { "auxiliary_loss_clip": 0.01220182, "auxiliary_loss_mlp": 0.01039036, "balance_loss_clip": 1.06708384, "balance_loss_mlp": 1.02873683, "epoch": 0.09366921180785186, "flos": 24279563779200.0, "grad_norm": 1.8082726127464892, "language_loss": 0.89375043, "learning_rate": 3.957698308516016e-06, "loss": 0.91634262, "num_input_tokens_seen": 16644985, "step": 779, "time_per_iteration": 2.5842576026916504 }, { "auxiliary_loss_clip": 0.01235963, "auxiliary_loss_mlp": 0.0076357, "balance_loss_clip": 1.07534337, "balance_loss_mlp": 1.00003648, "epoch": 0.09378945469849095, "flos": 18730036419840.0, "grad_norm": 1.8862766899888983, "language_loss": 0.82688439, "learning_rate": 3.957538795426688e-06, "loss": 0.84687972, "num_input_tokens_seen": 16662410, "step": 780, "time_per_iteration": 2.519973039627075 }, { "auxiliary_loss_clip": 0.01221954, "auxiliary_loss_mlp": 0.01051366, "balance_loss_clip": 1.06897378, "balance_loss_mlp": 1.04016054, "epoch": 0.09390969758913004, "flos": 23218834222080.0, "grad_norm": 2.126045240351721, "language_loss": 0.77135015, "learning_rate": 3.9573789853802804e-06, "loss": 0.79408336, "num_input_tokens_seen": 16680885, "step": 781, "time_per_iteration": 2.566715955734253 }, { "auxiliary_loss_clip": 0.01220732, "auxiliary_loss_mlp": 0.00763744, "balance_loss_clip": 1.07116389, "balance_loss_mlp": 1.0000267, "epoch": 0.09402994047976913, "flos": 19646728439040.0, "grad_norm": 2.2818162487498217, "language_loss": 0.74777758, "learning_rate": 3.957218878401037e-06, "loss": 0.76762229, "num_input_tokens_seen": 16699375, "step": 782, "time_per_iteration": 2.549079418182373 }, { "auxiliary_loss_clip": 0.01256979, "auxiliary_loss_mlp": 0.01047055, "balance_loss_clip": 1.07691932, "balance_loss_mlp": 1.03592086, "epoch": 0.09415018337040823, "flos": 29420463041280.0, "grad_norm": 3.0429568301400134, "language_loss": 0.8947165, "learning_rate": 3.957058474513246e-06, "loss": 0.9177568, "num_input_tokens_seen": 16719230, "step": 783, "time_per_iteration": 2.55474853515625 }, { "auxiliary_loss_clip": 0.01234837, "auxiliary_loss_mlp": 0.01036693, "balance_loss_clip": 1.07162476, "balance_loss_mlp": 1.02605975, "epoch": 0.09427042626104731, "flos": 24572092141440.0, "grad_norm": 1.9127993878733849, "language_loss": 0.78336012, "learning_rate": 3.956897773741241e-06, "loss": 0.80607545, "num_input_tokens_seen": 16738220, "step": 784, "time_per_iteration": 2.5585761070251465 }, { "auxiliary_loss_clip": 0.01207102, "auxiliary_loss_mlp": 0.01043029, "balance_loss_clip": 1.06444466, "balance_loss_mlp": 1.03208566, "epoch": 0.09439066915168641, "flos": 26359581576960.0, "grad_norm": 1.9038368676099195, "language_loss": 0.71648973, "learning_rate": 3.956736776109398e-06, "loss": 0.73899108, "num_input_tokens_seen": 16759395, "step": 785, "time_per_iteration": 2.5801525115966797 }, { "auxiliary_loss_clip": 0.01228459, "auxiliary_loss_mlp": 0.00764391, "balance_loss_clip": 1.06875777, "balance_loss_mlp": 1.00002289, "epoch": 0.09451091204232549, "flos": 19427278296960.0, "grad_norm": 1.8991505290698107, "language_loss": 0.83676255, "learning_rate": 3.956575481642143e-06, "loss": 0.85669106, "num_input_tokens_seen": 16778285, "step": 786, "time_per_iteration": 2.5168731212615967 }, { "auxiliary_loss_clip": 0.01185329, "auxiliary_loss_mlp": 0.01048422, "balance_loss_clip": 1.05971813, "balance_loss_mlp": 1.03713906, "epoch": 0.09463115493296459, "flos": 25368051571200.0, "grad_norm": 2.3696279173477497, "language_loss": 0.74984396, "learning_rate": 3.956413890363943e-06, "loss": 0.77218151, "num_input_tokens_seen": 16795265, "step": 787, "time_per_iteration": 2.650292158126831 }, { "auxiliary_loss_clip": 0.01232435, "auxiliary_loss_mlp": 0.01042487, "balance_loss_clip": 1.0694437, "balance_loss_mlp": 1.03105497, "epoch": 0.09475139782360369, "flos": 10123254869760.0, "grad_norm": 2.190885896853387, "language_loss": 0.81679285, "learning_rate": 3.956252002299312e-06, "loss": 0.83954203, "num_input_tokens_seen": 16811165, "step": 788, "time_per_iteration": 2.4715182781219482 }, { "auxiliary_loss_clip": 0.01256254, "auxiliary_loss_mlp": 0.01037733, "balance_loss_clip": 1.07587671, "balance_loss_mlp": 1.02636659, "epoch": 0.09487164071424277, "flos": 17231088936960.0, "grad_norm": 2.241285256299441, "language_loss": 0.9113155, "learning_rate": 3.956089817472807e-06, "loss": 0.93425524, "num_input_tokens_seen": 16828470, "step": 789, "time_per_iteration": 2.472564220428467 }, { "auxiliary_loss_clip": 0.01220526, "auxiliary_loss_mlp": 0.01042568, "balance_loss_clip": 1.07113636, "balance_loss_mlp": 1.03087389, "epoch": 0.09499188360488187, "flos": 30849564528000.0, "grad_norm": 2.6620955090165017, "language_loss": 0.85394228, "learning_rate": 3.955927335909032e-06, "loss": 0.8765732, "num_input_tokens_seen": 16851680, "step": 790, "time_per_iteration": 2.6445605754852295 }, { "auxiliary_loss_clip": 0.01190948, "auxiliary_loss_mlp": 0.01032002, "balance_loss_clip": 1.06897473, "balance_loss_mlp": 1.02177989, "epoch": 0.09511212649552095, "flos": 29351694453120.0, "grad_norm": 2.2118340729420627, "language_loss": 0.75587887, "learning_rate": 3.955764557632634e-06, "loss": 0.77810836, "num_input_tokens_seen": 16871490, "step": 791, "time_per_iteration": 2.67232346534729 }, { "auxiliary_loss_clip": 0.01215823, "auxiliary_loss_mlp": 0.01041025, "balance_loss_clip": 1.06738222, "balance_loss_mlp": 1.03064179, "epoch": 0.09523236938616005, "flos": 10378687461120.0, "grad_norm": 2.2210295547720147, "language_loss": 0.94729877, "learning_rate": 3.955601482668309e-06, "loss": 0.96986729, "num_input_tokens_seen": 16889350, "step": 792, "time_per_iteration": 2.539994478225708 }, { "auxiliary_loss_clip": 0.01181042, "auxiliary_loss_mlp": 0.01040544, "balance_loss_clip": 1.05739176, "balance_loss_mlp": 1.02895117, "epoch": 0.09535261227679913, "flos": 19061815368960.0, "grad_norm": 1.865288579395569, "language_loss": 0.88236117, "learning_rate": 3.955438111040794e-06, "loss": 0.90457702, "num_input_tokens_seen": 16907625, "step": 793, "time_per_iteration": 2.5963032245635986 }, { "auxiliary_loss_clip": 0.01184178, "auxiliary_loss_mlp": 0.0103867, "balance_loss_clip": 1.06101346, "balance_loss_mlp": 1.02795255, "epoch": 0.09547285516743823, "flos": 20922993555840.0, "grad_norm": 1.910129269260517, "language_loss": 0.80285174, "learning_rate": 3.955274442774873e-06, "loss": 0.82508016, "num_input_tokens_seen": 16926205, "step": 794, "time_per_iteration": 2.619797706604004 }, { "auxiliary_loss_clip": 0.01234076, "auxiliary_loss_mlp": 0.01043207, "balance_loss_clip": 1.0691874, "balance_loss_mlp": 1.03223348, "epoch": 0.09559309805807732, "flos": 30154405639680.0, "grad_norm": 2.6095774917638, "language_loss": 0.70466709, "learning_rate": 3.9551104778953725e-06, "loss": 0.72743988, "num_input_tokens_seen": 16946500, "step": 795, "time_per_iteration": 2.5989956855773926 }, { "auxiliary_loss_clip": 0.01202046, "auxiliary_loss_mlp": 0.01037789, "balance_loss_clip": 1.06345677, "balance_loss_mlp": 1.02704215, "epoch": 0.0957133409487164, "flos": 21066743784960.0, "grad_norm": 2.118252157640684, "language_loss": 0.85165131, "learning_rate": 3.954946216427167e-06, "loss": 0.87404966, "num_input_tokens_seen": 16966960, "step": 796, "time_per_iteration": 2.58160400390625 }, { "auxiliary_loss_clip": 0.01100601, "auxiliary_loss_mlp": 0.01007093, "balance_loss_clip": 1.03456044, "balance_loss_mlp": 1.00294447, "epoch": 0.0958335838393555, "flos": 71297979315840.0, "grad_norm": 0.8764931478142052, "language_loss": 0.61545932, "learning_rate": 3.954781658395176e-06, "loss": 0.63653624, "num_input_tokens_seen": 17023215, "step": 797, "time_per_iteration": 3.122253179550171 }, { "auxiliary_loss_clip": 0.01228035, "auxiliary_loss_mlp": 0.01046699, "balance_loss_clip": 1.06754375, "balance_loss_mlp": 1.03571939, "epoch": 0.09595382672999458, "flos": 21872974504320.0, "grad_norm": 1.9217903881254186, "language_loss": 0.92560965, "learning_rate": 3.95461680382436e-06, "loss": 0.94835699, "num_input_tokens_seen": 17042140, "step": 798, "time_per_iteration": 3.3956949710845947 }, { "auxiliary_loss_clip": 0.01241824, "auxiliary_loss_mlp": 0.01049435, "balance_loss_clip": 1.07592916, "balance_loss_mlp": 1.03788948, "epoch": 0.09607406962063368, "flos": 18695562341760.0, "grad_norm": 2.7872625104505433, "language_loss": 0.85986185, "learning_rate": 3.9544516527397295e-06, "loss": 0.88277447, "num_input_tokens_seen": 17058490, "step": 799, "time_per_iteration": 2.528494358062744 }, { "auxiliary_loss_clip": 0.01203247, "auxiliary_loss_mlp": 0.01042529, "balance_loss_clip": 1.06584811, "balance_loss_mlp": 1.03190184, "epoch": 0.09619431251127276, "flos": 22568456615040.0, "grad_norm": 1.6457009212158078, "language_loss": 0.80850255, "learning_rate": 3.954286205166338e-06, "loss": 0.83096021, "num_input_tokens_seen": 17079655, "step": 800, "time_per_iteration": 3.3440635204315186 }, { "auxiliary_loss_clip": 0.01241329, "auxiliary_loss_mlp": 0.01049898, "balance_loss_clip": 1.07410145, "balance_loss_mlp": 1.03837669, "epoch": 0.09631455540191186, "flos": 14246230608000.0, "grad_norm": 2.185856652974379, "language_loss": 0.8390227, "learning_rate": 3.954120461129282e-06, "loss": 0.86193496, "num_input_tokens_seen": 17097065, "step": 801, "time_per_iteration": 3.180696725845337 }, { "auxiliary_loss_clip": 0.01255802, "auxiliary_loss_mlp": 0.01042073, "balance_loss_clip": 1.07748771, "balance_loss_mlp": 1.03102839, "epoch": 0.09643479829255096, "flos": 20740387789440.0, "grad_norm": 1.9157662034450935, "language_loss": 0.83700061, "learning_rate": 3.953954420653706e-06, "loss": 0.85997933, "num_input_tokens_seen": 17114090, "step": 802, "time_per_iteration": 2.499013900756836 }, { "auxiliary_loss_clip": 0.01238333, "auxiliary_loss_mlp": 0.01045403, "balance_loss_clip": 1.07250559, "balance_loss_mlp": 1.03428078, "epoch": 0.09655504118319004, "flos": 24420476833920.0, "grad_norm": 1.8602508089375915, "language_loss": 0.8800419, "learning_rate": 3.953788083764798e-06, "loss": 0.90287924, "num_input_tokens_seen": 17133325, "step": 803, "time_per_iteration": 3.3925650119781494 }, { "auxiliary_loss_clip": 0.01189491, "auxiliary_loss_mlp": 0.01046537, "balance_loss_clip": 1.06865215, "balance_loss_mlp": 1.035254, "epoch": 0.09667528407382914, "flos": 18441961344000.0, "grad_norm": 2.1780272973575436, "language_loss": 0.92036736, "learning_rate": 3.953621450487792e-06, "loss": 0.94272763, "num_input_tokens_seen": 17151945, "step": 804, "time_per_iteration": 2.5810434818267822 }, { "auxiliary_loss_clip": 0.0113568, "auxiliary_loss_mlp": 0.0100758, "balance_loss_clip": 1.03099465, "balance_loss_mlp": 1.00357425, "epoch": 0.09679552696446822, "flos": 70816455544320.0, "grad_norm": 0.8380963686147276, "language_loss": 0.61251444, "learning_rate": 3.953454520847964e-06, "loss": 0.63394701, "num_input_tokens_seen": 17216790, "step": 805, "time_per_iteration": 3.2256829738616943 }, { "auxiliary_loss_clip": 0.01216512, "auxiliary_loss_mlp": 0.01040129, "balance_loss_clip": 1.06671476, "balance_loss_mlp": 1.02760649, "epoch": 0.09691576985510732, "flos": 21945514020480.0, "grad_norm": 1.8947488758438518, "language_loss": 0.73259622, "learning_rate": 3.9532872948706395e-06, "loss": 0.75516266, "num_input_tokens_seen": 17236285, "step": 806, "time_per_iteration": 2.556593418121338 }, { "auxiliary_loss_clip": 0.01223589, "auxiliary_loss_mlp": 0.0104746, "balance_loss_clip": 1.06952691, "balance_loss_mlp": 1.03556871, "epoch": 0.09703601274574641, "flos": 17965211103360.0, "grad_norm": 2.367058132327071, "language_loss": 0.82585919, "learning_rate": 3.9531197725811845e-06, "loss": 0.84856969, "num_input_tokens_seen": 17251670, "step": 807, "time_per_iteration": 2.523057460784912 }, { "auxiliary_loss_clip": 0.01254588, "auxiliary_loss_mlp": 0.01044931, "balance_loss_clip": 1.07643676, "balance_loss_mlp": 1.0345118, "epoch": 0.0971562556363855, "flos": 22162162901760.0, "grad_norm": 1.83577378244001, "language_loss": 0.87803149, "learning_rate": 3.952951954005013e-06, "loss": 0.90102673, "num_input_tokens_seen": 17271355, "step": 808, "time_per_iteration": 2.517145872116089 }, { "auxiliary_loss_clip": 0.01220373, "auxiliary_loss_mlp": 0.01035711, "balance_loss_clip": 1.06493044, "balance_loss_mlp": 1.02553701, "epoch": 0.0972764985270246, "flos": 25848716394240.0, "grad_norm": 1.921277095838438, "language_loss": 0.84816921, "learning_rate": 3.952783839167584e-06, "loss": 0.87073004, "num_input_tokens_seen": 17291400, "step": 809, "time_per_iteration": 2.5988826751708984 }, { "auxiliary_loss_clip": 0.01234636, "auxiliary_loss_mlp": 0.01051067, "balance_loss_clip": 1.0695436, "balance_loss_mlp": 1.03924155, "epoch": 0.09739674141766368, "flos": 20339373375360.0, "grad_norm": 2.534639199745739, "language_loss": 0.74351418, "learning_rate": 3.952615428094398e-06, "loss": 0.76637125, "num_input_tokens_seen": 17310920, "step": 810, "time_per_iteration": 2.537630796432495 }, { "auxiliary_loss_clip": 0.0117562, "auxiliary_loss_mlp": 0.01050176, "balance_loss_clip": 1.05844605, "balance_loss_mlp": 1.03914952, "epoch": 0.09751698430830277, "flos": 15743059188480.0, "grad_norm": 1.6978351886828218, "language_loss": 0.73558933, "learning_rate": 3.952446720811004e-06, "loss": 0.75784731, "num_input_tokens_seen": 17329245, "step": 811, "time_per_iteration": 2.597297430038452 }, { "auxiliary_loss_clip": 0.01092532, "auxiliary_loss_mlp": 0.01005437, "balance_loss_clip": 1.02486873, "balance_loss_mlp": 1.00121737, "epoch": 0.09763722719894186, "flos": 63716806800000.0, "grad_norm": 0.8335617792241169, "language_loss": 0.63605231, "learning_rate": 3.952277717342995e-06, "loss": 0.65703207, "num_input_tokens_seen": 17395680, "step": 812, "time_per_iteration": 3.269383430480957 }, { "auxiliary_loss_clip": 0.0122565, "auxiliary_loss_mlp": 0.01044378, "balance_loss_clip": 1.06866503, "balance_loss_mlp": 1.0333811, "epoch": 0.09775747008958095, "flos": 22090916275200.0, "grad_norm": 3.0227241200342463, "language_loss": 0.85668766, "learning_rate": 3.952108417716009e-06, "loss": 0.87938792, "num_input_tokens_seen": 17415135, "step": 813, "time_per_iteration": 2.5668752193450928 }, { "auxiliary_loss_clip": 0.01240569, "auxiliary_loss_mlp": 0.0106392, "balance_loss_clip": 1.07471824, "balance_loss_mlp": 1.05230927, "epoch": 0.09787771298022005, "flos": 21286050272640.0, "grad_norm": 1.8641025883405788, "language_loss": 0.85131419, "learning_rate": 3.951938821955727e-06, "loss": 0.87435913, "num_input_tokens_seen": 17434535, "step": 814, "time_per_iteration": 2.557171583175659 }, { "auxiliary_loss_clip": 0.01221039, "auxiliary_loss_mlp": 0.01041716, "balance_loss_clip": 1.06949186, "balance_loss_mlp": 1.03011131, "epoch": 0.09799795587085913, "flos": 22054574689920.0, "grad_norm": 1.5485501188567774, "language_loss": 0.76628995, "learning_rate": 3.9517689300878786e-06, "loss": 0.78891754, "num_input_tokens_seen": 17454270, "step": 815, "time_per_iteration": 2.569112777709961 }, { "auxiliary_loss_clip": 0.01253196, "auxiliary_loss_mlp": 0.01048904, "balance_loss_clip": 1.07310891, "balance_loss_mlp": 1.03734076, "epoch": 0.09811819876149823, "flos": 22163743100160.0, "grad_norm": 1.8074400912840598, "language_loss": 0.78495026, "learning_rate": 3.951598742138236e-06, "loss": 0.80797124, "num_input_tokens_seen": 17472995, "step": 816, "time_per_iteration": 2.506138801574707 }, { "auxiliary_loss_clip": 0.01219645, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.06345177, "balance_loss_mlp": 1.03365922, "epoch": 0.09823844165213731, "flos": 22231111057920.0, "grad_norm": 2.04210868199264, "language_loss": 0.80102754, "learning_rate": 3.951428258132615e-06, "loss": 0.82367361, "num_input_tokens_seen": 17491115, "step": 817, "time_per_iteration": 2.5632827281951904 }, { "auxiliary_loss_clip": 0.01225501, "auxiliary_loss_mlp": 0.010524, "balance_loss_clip": 1.07161558, "balance_loss_mlp": 1.04062831, "epoch": 0.09835868454277641, "flos": 22487728798080.0, "grad_norm": 1.90896095399811, "language_loss": 0.84509647, "learning_rate": 3.951257478096879e-06, "loss": 0.86787546, "num_input_tokens_seen": 17509480, "step": 818, "time_per_iteration": 2.5612056255340576 }, { "auxiliary_loss_clip": 0.01224451, "auxiliary_loss_mlp": 0.00764592, "balance_loss_clip": 1.06964374, "balance_loss_mlp": 1.00006032, "epoch": 0.0984789274334155, "flos": 16362554077440.0, "grad_norm": 2.819386655306629, "language_loss": 0.68649983, "learning_rate": 3.951086402056936e-06, "loss": 0.70639026, "num_input_tokens_seen": 17524080, "step": 819, "time_per_iteration": 2.5498387813568115 }, { "auxiliary_loss_clip": 0.01157734, "auxiliary_loss_mlp": 0.00764563, "balance_loss_clip": 1.06728888, "balance_loss_mlp": 1.00007248, "epoch": 0.09859917032405459, "flos": 24243545416320.0, "grad_norm": 1.6078236189118822, "language_loss": 0.83438742, "learning_rate": 3.950915030038735e-06, "loss": 0.85361034, "num_input_tokens_seen": 17543875, "step": 820, "time_per_iteration": 2.839092254638672 }, { "auxiliary_loss_clip": 0.0123058, "auxiliary_loss_mlp": 0.01043523, "balance_loss_clip": 1.06996238, "balance_loss_mlp": 1.03266346, "epoch": 0.09871941321469369, "flos": 17420195064960.0, "grad_norm": 2.1801582473989582, "language_loss": 0.83534747, "learning_rate": 3.9507433620682765e-06, "loss": 0.85808855, "num_input_tokens_seen": 17560810, "step": 821, "time_per_iteration": 2.874157190322876 }, { "auxiliary_loss_clip": 0.01203699, "auxiliary_loss_mlp": 0.01049083, "balance_loss_clip": 1.06358516, "balance_loss_mlp": 1.03771102, "epoch": 0.09883965610533277, "flos": 28477341590400.0, "grad_norm": 1.656322871188082, "language_loss": 0.88049924, "learning_rate": 3.9505713981716e-06, "loss": 0.90302706, "num_input_tokens_seen": 17583640, "step": 822, "time_per_iteration": 2.6769068241119385 }, { "auxiliary_loss_clip": 0.01220481, "auxiliary_loss_mlp": 0.01042954, "balance_loss_clip": 1.07116115, "balance_loss_mlp": 1.03226042, "epoch": 0.09895989899597187, "flos": 23693932437120.0, "grad_norm": 1.813834112273762, "language_loss": 0.81190968, "learning_rate": 3.950399138374795e-06, "loss": 0.83454406, "num_input_tokens_seen": 17602720, "step": 823, "time_per_iteration": 2.558539867401123 }, { "auxiliary_loss_clip": 0.01231509, "auxiliary_loss_mlp": 0.01044375, "balance_loss_clip": 1.06714046, "balance_loss_mlp": 1.03309226, "epoch": 0.09908014188661095, "flos": 24679608526080.0, "grad_norm": 1.6827354318034067, "language_loss": 0.74400908, "learning_rate": 3.95022658270399e-06, "loss": 0.76676798, "num_input_tokens_seen": 17623085, "step": 824, "time_per_iteration": 3.3720877170562744 }, { "auxiliary_loss_clip": 0.01213241, "auxiliary_loss_mlp": 0.01041649, "balance_loss_clip": 1.06730986, "balance_loss_mlp": 1.03057492, "epoch": 0.09920038477725004, "flos": 14064307200000.0, "grad_norm": 1.9034350792718542, "language_loss": 0.781335, "learning_rate": 3.9500537311853635e-06, "loss": 0.80388391, "num_input_tokens_seen": 17641040, "step": 825, "time_per_iteration": 2.51812744140625 }, { "auxiliary_loss_clip": 0.0123678, "auxiliary_loss_mlp": 0.01043087, "balance_loss_clip": 1.0682919, "balance_loss_mlp": 1.03160167, "epoch": 0.09932062766788914, "flos": 13407070095360.0, "grad_norm": 3.035083570808106, "language_loss": 0.83366144, "learning_rate": 3.949880583845136e-06, "loss": 0.85646009, "num_input_tokens_seen": 17659115, "step": 826, "time_per_iteration": 2.545421838760376 }, { "auxiliary_loss_clip": 0.01218963, "auxiliary_loss_mlp": 0.01046129, "balance_loss_clip": 1.06821251, "balance_loss_mlp": 1.03497672, "epoch": 0.09944087055852822, "flos": 19500751566720.0, "grad_norm": 1.9705324209820514, "language_loss": 0.81547451, "learning_rate": 3.949707140709575e-06, "loss": 0.83812535, "num_input_tokens_seen": 17678845, "step": 827, "time_per_iteration": 3.319261074066162 }, { "auxiliary_loss_clip": 0.01234948, "auxiliary_loss_mlp": 0.01039722, "balance_loss_clip": 1.06725454, "balance_loss_mlp": 1.02941024, "epoch": 0.09956111344916732, "flos": 17749100926080.0, "grad_norm": 2.171570872830343, "language_loss": 0.83059132, "learning_rate": 3.949533401804991e-06, "loss": 0.853338, "num_input_tokens_seen": 17695750, "step": 828, "time_per_iteration": 2.4865283966064453 }, { "auxiliary_loss_clip": 0.01231718, "auxiliary_loss_mlp": 0.00764567, "balance_loss_clip": 1.07041526, "balance_loss_mlp": 1.00004935, "epoch": 0.0996813563398064, "flos": 17967581400960.0, "grad_norm": 1.8813725245447654, "language_loss": 0.90611756, "learning_rate": 3.949359367157739e-06, "loss": 0.92608047, "num_input_tokens_seen": 17714445, "step": 829, "time_per_iteration": 3.3413572311401367 }, { "auxiliary_loss_clip": 0.01237427, "auxiliary_loss_mlp": 0.01043575, "balance_loss_clip": 1.06993675, "balance_loss_mlp": 1.03221393, "epoch": 0.0998015992304455, "flos": 17457039440640.0, "grad_norm": 2.305719481196296, "language_loss": 0.75660098, "learning_rate": 3.949185036794222e-06, "loss": 0.77941096, "num_input_tokens_seen": 17732455, "step": 830, "time_per_iteration": 2.5150344371795654 }, { "auxiliary_loss_clip": 0.01248247, "auxiliary_loss_mlp": 0.01043081, "balance_loss_clip": 1.07239449, "balance_loss_mlp": 1.03232801, "epoch": 0.0999218421210846, "flos": 25888757080320.0, "grad_norm": 1.6409624883503955, "language_loss": 0.78551722, "learning_rate": 3.949010410740884e-06, "loss": 0.80843049, "num_input_tokens_seen": 17755280, "step": 831, "time_per_iteration": 2.5663540363311768 }, { "auxiliary_loss_clip": 0.01212952, "auxiliary_loss_mlp": 0.00764552, "balance_loss_clip": 1.06834388, "balance_loss_mlp": 1.00009584, "epoch": 0.10004208501172368, "flos": 21215916967680.0, "grad_norm": 1.6188494846926345, "language_loss": 0.86455977, "learning_rate": 3.948835489024216e-06, "loss": 0.8843348, "num_input_tokens_seen": 17775015, "step": 832, "time_per_iteration": 2.564117908477783 }, { "auxiliary_loss_clip": 0.01235673, "auxiliary_loss_mlp": 0.01040208, "balance_loss_clip": 1.06813419, "balance_loss_mlp": 1.02934837, "epoch": 0.10016232790236278, "flos": 17348409734400.0, "grad_norm": 2.2297956637223053, "language_loss": 0.90384543, "learning_rate": 3.948660271670755e-06, "loss": 0.92660421, "num_input_tokens_seen": 17792165, "step": 833, "time_per_iteration": 2.525570869445801 }, { "auxiliary_loss_clip": 0.01216253, "auxiliary_loss_mlp": 0.01045737, "balance_loss_clip": 1.06751299, "balance_loss_mlp": 1.03510916, "epoch": 0.10028257079300186, "flos": 25666541591040.0, "grad_norm": 2.0975291970970646, "language_loss": 0.84480757, "learning_rate": 3.948484758707079e-06, "loss": 0.86742747, "num_input_tokens_seen": 17811765, "step": 834, "time_per_iteration": 2.5966455936431885 }, { "auxiliary_loss_clip": 0.01193094, "auxiliary_loss_mlp": 0.01042407, "balance_loss_clip": 1.06019402, "balance_loss_mlp": 1.03067732, "epoch": 0.10040281368364096, "flos": 25156035544320.0, "grad_norm": 2.0389344336043758, "language_loss": 0.83610398, "learning_rate": 3.948308950159815e-06, "loss": 0.858459, "num_input_tokens_seen": 17830445, "step": 835, "time_per_iteration": 2.60986065864563 }, { "auxiliary_loss_clip": 0.01194724, "auxiliary_loss_mlp": 0.01045499, "balance_loss_clip": 1.05928612, "balance_loss_mlp": 1.03375053, "epoch": 0.10052305657428004, "flos": 17603303621760.0, "grad_norm": 2.469373135588881, "language_loss": 0.75759947, "learning_rate": 3.9481328460556326e-06, "loss": 0.78000164, "num_input_tokens_seen": 17847665, "step": 836, "time_per_iteration": 2.5570383071899414 }, { "auxiliary_loss_clip": 0.01211864, "auxiliary_loss_mlp": 0.01045634, "balance_loss_clip": 1.06635928, "balance_loss_mlp": 1.03386247, "epoch": 0.10064329946491914, "flos": 18660154510080.0, "grad_norm": 2.380700869328185, "language_loss": 0.89809179, "learning_rate": 3.9479564464212455e-06, "loss": 0.92066669, "num_input_tokens_seen": 17866825, "step": 837, "time_per_iteration": 2.553717613220215 }, { "auxiliary_loss_clip": 0.01254779, "auxiliary_loss_mlp": 0.0104661, "balance_loss_clip": 1.07200336, "balance_loss_mlp": 1.03477883, "epoch": 0.10076354235555823, "flos": 17199056983680.0, "grad_norm": 2.3042279231750027, "language_loss": 0.76076603, "learning_rate": 3.947779751283414e-06, "loss": 0.78377998, "num_input_tokens_seen": 17883995, "step": 838, "time_per_iteration": 2.474430561065674 }, { "auxiliary_loss_clip": 0.01234569, "auxiliary_loss_mlp": 0.00765068, "balance_loss_clip": 1.07267404, "balance_loss_mlp": 1.00010502, "epoch": 0.10088378524619732, "flos": 22962252395520.0, "grad_norm": 1.7998301165170738, "language_loss": 0.75888979, "learning_rate": 3.947602760668944e-06, "loss": 0.77888614, "num_input_tokens_seen": 17903785, "step": 839, "time_per_iteration": 2.532369613647461 }, { "auxiliary_loss_clip": 0.01237049, "auxiliary_loss_mlp": 0.01049751, "balance_loss_clip": 1.07411981, "balance_loss_mlp": 1.03800297, "epoch": 0.10100402813683641, "flos": 37885828746240.0, "grad_norm": 1.789496284111264, "language_loss": 0.71289456, "learning_rate": 3.947425474604684e-06, "loss": 0.7357626, "num_input_tokens_seen": 17927720, "step": 840, "time_per_iteration": 2.6679022312164307 }, { "auxiliary_loss_clip": 0.01220094, "auxiliary_loss_mlp": 0.01050517, "balance_loss_clip": 1.06744123, "balance_loss_mlp": 1.03904939, "epoch": 0.1011242710274755, "flos": 21543458112000.0, "grad_norm": 1.9690073978246612, "language_loss": 0.92514485, "learning_rate": 3.947247893117528e-06, "loss": 0.947851, "num_input_tokens_seen": 17946225, "step": 841, "time_per_iteration": 2.5698113441467285 }, { "auxiliary_loss_clip": 0.01231355, "auxiliary_loss_mlp": 0.01043307, "balance_loss_clip": 1.0684793, "balance_loss_mlp": 1.03178513, "epoch": 0.10124451391811459, "flos": 13621456419840.0, "grad_norm": 3.3586695482244173, "language_loss": 0.69828212, "learning_rate": 3.947070016234413e-06, "loss": 0.7210288, "num_input_tokens_seen": 17962015, "step": 842, "time_per_iteration": 2.489269971847534 }, { "auxiliary_loss_clip": 0.01231674, "auxiliary_loss_mlp": 0.01042418, "balance_loss_clip": 1.07137299, "balance_loss_mlp": 1.03138518, "epoch": 0.10136475680875369, "flos": 16649228522880.0, "grad_norm": 2.3384660085368556, "language_loss": 0.74703544, "learning_rate": 3.946891843982326e-06, "loss": 0.76977634, "num_input_tokens_seen": 17979680, "step": 843, "time_per_iteration": 2.5321874618530273 }, { "auxiliary_loss_clip": 0.01236888, "auxiliary_loss_mlp": 0.01049568, "balance_loss_clip": 1.0729481, "balance_loss_mlp": 1.03790367, "epoch": 0.10148499969939277, "flos": 19461034103040.0, "grad_norm": 2.0456967242406976, "language_loss": 0.74558693, "learning_rate": 3.9467133763882935e-06, "loss": 0.76845151, "num_input_tokens_seen": 17998145, "step": 844, "time_per_iteration": 2.509788990020752 }, { "auxiliary_loss_clip": 0.01221596, "auxiliary_loss_mlp": 0.01045163, "balance_loss_clip": 1.06753206, "balance_loss_mlp": 1.03402841, "epoch": 0.10160524259003187, "flos": 21104988791040.0, "grad_norm": 2.1092080766765533, "language_loss": 0.86170149, "learning_rate": 3.9465346134793905e-06, "loss": 0.88436902, "num_input_tokens_seen": 18017955, "step": 845, "time_per_iteration": 2.541602373123169 }, { "auxiliary_loss_clip": 0.01208352, "auxiliary_loss_mlp": 0.01046795, "balance_loss_clip": 1.06977129, "balance_loss_mlp": 1.03545833, "epoch": 0.10172548548067095, "flos": 17712687513600.0, "grad_norm": 2.7457940612750606, "language_loss": 0.79737431, "learning_rate": 3.9463555552827335e-06, "loss": 0.81992579, "num_input_tokens_seen": 18035125, "step": 846, "time_per_iteration": 2.6576967239379883 }, { "auxiliary_loss_clip": 0.01218392, "auxiliary_loss_mlp": 0.01039557, "balance_loss_clip": 1.0660125, "balance_loss_mlp": 1.02819633, "epoch": 0.10184572837131005, "flos": 21104845136640.0, "grad_norm": 2.4021835581696473, "language_loss": 0.86235654, "learning_rate": 3.946176201825487e-06, "loss": 0.88493598, "num_input_tokens_seen": 18053160, "step": 847, "time_per_iteration": 2.539459705352783 }, { "auxiliary_loss_clip": 0.01222907, "auxiliary_loss_mlp": 0.01048764, "balance_loss_clip": 1.07386994, "balance_loss_mlp": 1.03685498, "epoch": 0.10196597126194913, "flos": 26067591918720.0, "grad_norm": 1.8273498981741665, "language_loss": 0.83515042, "learning_rate": 3.9459965531348575e-06, "loss": 0.85786712, "num_input_tokens_seen": 18072815, "step": 848, "time_per_iteration": 2.589956283569336 }, { "auxiliary_loss_clip": 0.01221897, "auxiliary_loss_mlp": 0.0076538, "balance_loss_clip": 1.0712949, "balance_loss_mlp": 1.00008512, "epoch": 0.10208621415258823, "flos": 29314634595840.0, "grad_norm": 2.7249732804356817, "language_loss": 0.85981137, "learning_rate": 3.945816609238098e-06, "loss": 0.87968409, "num_input_tokens_seen": 18092225, "step": 849, "time_per_iteration": 2.6345951557159424 }, { "auxiliary_loss_clip": 0.01181141, "auxiliary_loss_mlp": 0.01041118, "balance_loss_clip": 1.06556726, "balance_loss_mlp": 1.02880394, "epoch": 0.10220645704322733, "flos": 23805794367360.0, "grad_norm": 1.7634513737873236, "language_loss": 0.85207468, "learning_rate": 3.945636370162507e-06, "loss": 0.87429726, "num_input_tokens_seen": 18112335, "step": 850, "time_per_iteration": 3.4216551780700684 }, { "auxiliary_loss_clip": 0.01232757, "auxiliary_loss_mlp": 0.01047381, "balance_loss_clip": 1.06991184, "balance_loss_mlp": 1.03658044, "epoch": 0.10232669993386641, "flos": 23218546913280.0, "grad_norm": 1.7711195479214523, "language_loss": 0.79016912, "learning_rate": 3.945455835935425e-06, "loss": 0.81297052, "num_input_tokens_seen": 18131520, "step": 851, "time_per_iteration": 2.5623202323913574 }, { "auxiliary_loss_clip": 0.01220644, "auxiliary_loss_mlp": 0.01052154, "balance_loss_clip": 1.0691905, "balance_loss_mlp": 1.04039359, "epoch": 0.1024469428245055, "flos": 22922929981440.0, "grad_norm": 3.1000149717364254, "language_loss": 0.75237119, "learning_rate": 3.94527500658424e-06, "loss": 0.77509916, "num_input_tokens_seen": 18149185, "step": 852, "time_per_iteration": 2.563441753387451 }, { "auxiliary_loss_clip": 0.0118852, "auxiliary_loss_mlp": 0.01045565, "balance_loss_clip": 1.06686306, "balance_loss_mlp": 1.03462195, "epoch": 0.10256718571514459, "flos": 31359495957120.0, "grad_norm": 1.8450839460594908, "language_loss": 0.81124562, "learning_rate": 3.945093882136382e-06, "loss": 0.83358651, "num_input_tokens_seen": 18172960, "step": 853, "time_per_iteration": 4.168609380722046 }, { "auxiliary_loss_clip": 0.01219982, "auxiliary_loss_mlp": 0.00764201, "balance_loss_clip": 1.0710516, "balance_loss_mlp": 1.00010061, "epoch": 0.10268742860578368, "flos": 23474877344640.0, "grad_norm": 1.776553789906378, "language_loss": 0.84672195, "learning_rate": 3.944912462619329e-06, "loss": 0.8665638, "num_input_tokens_seen": 18191925, "step": 854, "time_per_iteration": 2.5688798427581787 }, { "auxiliary_loss_clip": 0.01230176, "auxiliary_loss_mlp": 0.01046636, "balance_loss_clip": 1.07111144, "balance_loss_mlp": 1.03441656, "epoch": 0.10280767149642277, "flos": 25520313323520.0, "grad_norm": 1.9063932291408048, "language_loss": 0.8047682, "learning_rate": 3.9447307480606025e-06, "loss": 0.82753623, "num_input_tokens_seen": 18212010, "step": 855, "time_per_iteration": 3.4602725505828857 }, { "auxiliary_loss_clip": 0.01211739, "auxiliary_loss_mlp": 0.01049356, "balance_loss_clip": 1.06651807, "balance_loss_mlp": 1.03835845, "epoch": 0.10292791438706186, "flos": 17347691462400.0, "grad_norm": 2.3858885467876187, "language_loss": 0.9018116, "learning_rate": 3.944548738487767e-06, "loss": 0.92442256, "num_input_tokens_seen": 18229525, "step": 856, "time_per_iteration": 2.51202392578125 }, { "auxiliary_loss_clip": 0.01259678, "auxiliary_loss_mlp": 0.01047372, "balance_loss_clip": 1.07812047, "balance_loss_mlp": 1.03537333, "epoch": 0.10304815727770096, "flos": 27052693390080.0, "grad_norm": 2.1267893489824714, "language_loss": 0.90995711, "learning_rate": 3.944366433928434e-06, "loss": 0.93302757, "num_input_tokens_seen": 18249505, "step": 857, "time_per_iteration": 2.554260492324829 }, { "auxiliary_loss_clip": 0.01215116, "auxiliary_loss_mlp": 0.01039843, "balance_loss_clip": 1.06776571, "balance_loss_mlp": 1.02997196, "epoch": 0.10316840016834004, "flos": 22782591544320.0, "grad_norm": 1.586471419624451, "language_loss": 0.83529651, "learning_rate": 3.9441838344102594e-06, "loss": 0.85784614, "num_input_tokens_seen": 18269230, "step": 858, "time_per_iteration": 2.5728750228881836 }, { "auxiliary_loss_clip": 0.01228881, "auxiliary_loss_mlp": 0.01043033, "balance_loss_clip": 1.07268262, "balance_loss_mlp": 1.03199422, "epoch": 0.10328864305897914, "flos": 20704584908160.0, "grad_norm": 2.1778685821263166, "language_loss": 0.67390478, "learning_rate": 3.944000939960943e-06, "loss": 0.69662392, "num_input_tokens_seen": 18287955, "step": 859, "time_per_iteration": 2.5609171390533447 }, { "auxiliary_loss_clip": 0.01238206, "auxiliary_loss_mlp": 0.01044837, "balance_loss_clip": 1.07038188, "balance_loss_mlp": 1.03484142, "epoch": 0.10340888594961822, "flos": 28478814048000.0, "grad_norm": 1.9092474364842906, "language_loss": 0.79969895, "learning_rate": 3.943817750608229e-06, "loss": 0.82252932, "num_input_tokens_seen": 18310505, "step": 860, "time_per_iteration": 2.5937371253967285 }, { "auxiliary_loss_clip": 0.01239797, "auxiliary_loss_mlp": 0.01039472, "balance_loss_clip": 1.07263398, "balance_loss_mlp": 1.02843904, "epoch": 0.10352912884025732, "flos": 13370333460480.0, "grad_norm": 2.362996196133597, "language_loss": 0.82070845, "learning_rate": 3.943634266379908e-06, "loss": 0.84350115, "num_input_tokens_seen": 18327400, "step": 861, "time_per_iteration": 2.493540048599243 }, { "auxiliary_loss_clip": 0.01241326, "auxiliary_loss_mlp": 0.01048773, "balance_loss_clip": 1.07421124, "balance_loss_mlp": 1.03800273, "epoch": 0.10364937173089642, "flos": 25558558329600.0, "grad_norm": 1.7361210071618254, "language_loss": 0.84764922, "learning_rate": 3.943450487303815e-06, "loss": 0.87055027, "num_input_tokens_seen": 18347895, "step": 862, "time_per_iteration": 2.5528788566589355 }, { "auxiliary_loss_clip": 0.01237558, "auxiliary_loss_mlp": 0.01043617, "balance_loss_clip": 1.07397807, "balance_loss_mlp": 1.0324589, "epoch": 0.1037696146215355, "flos": 21215486004480.0, "grad_norm": 2.855585656488234, "language_loss": 0.8536123, "learning_rate": 3.943266413407827e-06, "loss": 0.87642407, "num_input_tokens_seen": 18367170, "step": 863, "time_per_iteration": 2.522669792175293 }, { "auxiliary_loss_clip": 0.01244036, "auxiliary_loss_mlp": 0.01048749, "balance_loss_clip": 1.07534027, "balance_loss_mlp": 1.03844345, "epoch": 0.1038898575121746, "flos": 25807382818560.0, "grad_norm": 4.461103437516621, "language_loss": 0.84993929, "learning_rate": 3.94308204471987e-06, "loss": 0.87286711, "num_input_tokens_seen": 18386185, "step": 864, "time_per_iteration": 2.549011707305908 }, { "auxiliary_loss_clip": 0.01205562, "auxiliary_loss_mlp": 0.01043485, "balance_loss_clip": 1.06759715, "balance_loss_mlp": 1.0333519, "epoch": 0.10401010040281368, "flos": 19062425900160.0, "grad_norm": 2.2152117553880437, "language_loss": 0.74848735, "learning_rate": 3.942897381267912e-06, "loss": 0.77097785, "num_input_tokens_seen": 18402550, "step": 865, "time_per_iteration": 2.5553743839263916 }, { "auxiliary_loss_clip": 0.01237557, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.07303452, "balance_loss_mlp": 1.02787161, "epoch": 0.10413034329345278, "flos": 16355119962240.0, "grad_norm": 2.461889756001147, "language_loss": 0.66259277, "learning_rate": 3.942712423079965e-06, "loss": 0.68537027, "num_input_tokens_seen": 18418940, "step": 866, "time_per_iteration": 2.4869585037231445 }, { "auxiliary_loss_clip": 0.01184215, "auxiliary_loss_mlp": 0.01042543, "balance_loss_clip": 1.05818617, "balance_loss_mlp": 1.03173053, "epoch": 0.10425058618409186, "flos": 17236511890560.0, "grad_norm": 2.220438157546921, "language_loss": 0.90240145, "learning_rate": 3.942527170184088e-06, "loss": 0.92466897, "num_input_tokens_seen": 18435560, "step": 867, "time_per_iteration": 2.5531749725341797 }, { "auxiliary_loss_clip": 0.01257947, "auxiliary_loss_mlp": 0.01042323, "balance_loss_clip": 1.07668388, "balance_loss_mlp": 1.03137374, "epoch": 0.10437082907473096, "flos": 17967365919360.0, "grad_norm": 2.5430816714810547, "language_loss": 0.77735454, "learning_rate": 3.942341622608385e-06, "loss": 0.80035722, "num_input_tokens_seen": 18452590, "step": 868, "time_per_iteration": 2.461937189102173 }, { "auxiliary_loss_clip": 0.01226418, "auxiliary_loss_mlp": 0.01050803, "balance_loss_clip": 1.07539332, "balance_loss_mlp": 1.04006827, "epoch": 0.10449107196537005, "flos": 36283315374720.0, "grad_norm": 1.5690366864774377, "language_loss": 0.77826023, "learning_rate": 3.942155780381001e-06, "loss": 0.80103242, "num_input_tokens_seen": 18476325, "step": 869, "time_per_iteration": 2.6916921138763428 }, { "auxiliary_loss_clip": 0.01224249, "auxiliary_loss_mlp": 0.01054246, "balance_loss_clip": 1.071015, "balance_loss_mlp": 1.0421406, "epoch": 0.10461131485600914, "flos": 23802095266560.0, "grad_norm": 1.9178861344789284, "language_loss": 0.75863862, "learning_rate": 3.94196964353013e-06, "loss": 0.78142357, "num_input_tokens_seen": 18495775, "step": 870, "time_per_iteration": 2.5819168090820312 }, { "auxiliary_loss_clip": 0.01218292, "auxiliary_loss_mlp": 0.00764063, "balance_loss_clip": 1.06760359, "balance_loss_mlp": 1.00012302, "epoch": 0.10473155774664823, "flos": 18405476104320.0, "grad_norm": 2.4057577085901953, "language_loss": 0.80807567, "learning_rate": 3.941783212084008e-06, "loss": 0.82789922, "num_input_tokens_seen": 18513530, "step": 871, "time_per_iteration": 2.560265064239502 }, { "auxiliary_loss_clip": 0.01201534, "auxiliary_loss_mlp": 0.01059156, "balance_loss_clip": 1.06629157, "balance_loss_mlp": 1.04836202, "epoch": 0.10485180063728732, "flos": 25592637358080.0, "grad_norm": 2.687663357540943, "language_loss": 0.78517759, "learning_rate": 3.941596486070916e-06, "loss": 0.80778456, "num_input_tokens_seen": 18531575, "step": 872, "time_per_iteration": 2.6941590309143066 }, { "auxiliary_loss_clip": 0.01189214, "auxiliary_loss_mlp": 0.0104212, "balance_loss_clip": 1.07059467, "balance_loss_mlp": 1.03034759, "epoch": 0.10497204352792641, "flos": 27088747666560.0, "grad_norm": 4.50179509434207, "language_loss": 0.58819473, "learning_rate": 3.941409465519182e-06, "loss": 0.61050802, "num_input_tokens_seen": 18552100, "step": 873, "time_per_iteration": 2.6703388690948486 }, { "auxiliary_loss_clip": 0.01227285, "auxiliary_loss_mlp": 0.01050409, "balance_loss_clip": 1.06888366, "balance_loss_mlp": 1.03920925, "epoch": 0.10509228641856551, "flos": 32858479353600.0, "grad_norm": 1.6998899501423081, "language_loss": 0.85299563, "learning_rate": 3.941222150457176e-06, "loss": 0.8757726, "num_input_tokens_seen": 18575355, "step": 874, "time_per_iteration": 2.6571452617645264 }, { "auxiliary_loss_clip": 0.01241144, "auxiliary_loss_mlp": 0.01047844, "balance_loss_clip": 1.07096815, "balance_loss_mlp": 1.0365665, "epoch": 0.10521252930920459, "flos": 14319165173760.0, "grad_norm": 2.928694923809926, "language_loss": 0.71964639, "learning_rate": 3.941034540913311e-06, "loss": 0.74253625, "num_input_tokens_seen": 18592885, "step": 875, "time_per_iteration": 2.5020735263824463 }, { "auxiliary_loss_clip": 0.0123636, "auxiliary_loss_mlp": 0.00765104, "balance_loss_clip": 1.07246995, "balance_loss_mlp": 1.00009084, "epoch": 0.10533277219984369, "flos": 21687028773120.0, "grad_norm": 1.6717632620221499, "language_loss": 0.82586408, "learning_rate": 3.940846636916051e-06, "loss": 0.84587872, "num_input_tokens_seen": 18612920, "step": 876, "time_per_iteration": 3.2996864318847656 }, { "auxiliary_loss_clip": 0.01213933, "auxiliary_loss_mlp": 0.01049704, "balance_loss_clip": 1.07016122, "balance_loss_mlp": 1.03865361, "epoch": 0.10545301509048277, "flos": 22269787027200.0, "grad_norm": 2.10363794919024, "language_loss": 0.8630811, "learning_rate": 3.940658438493899e-06, "loss": 0.88571751, "num_input_tokens_seen": 18630765, "step": 877, "time_per_iteration": 2.541696071624756 }, { "auxiliary_loss_clip": 0.01249614, "auxiliary_loss_mlp": 0.01045978, "balance_loss_clip": 1.06987524, "balance_loss_mlp": 1.03397381, "epoch": 0.10557325798112187, "flos": 22199725549440.0, "grad_norm": 2.7504534556730604, "language_loss": 0.76005197, "learning_rate": 3.940469945675405e-06, "loss": 0.78300786, "num_input_tokens_seen": 18649150, "step": 878, "time_per_iteration": 2.485853672027588 }, { "auxiliary_loss_clip": 0.01164104, "auxiliary_loss_mlp": 0.01038212, "balance_loss_clip": 1.06195509, "balance_loss_mlp": 1.0279485, "epoch": 0.10569350087176095, "flos": 25775889569280.0, "grad_norm": 10.8469812207574, "language_loss": 0.91346717, "learning_rate": 3.940281158489163e-06, "loss": 0.93549031, "num_input_tokens_seen": 18668380, "step": 879, "time_per_iteration": 3.4364988803863525 }, { "auxiliary_loss_clip": 0.01166412, "auxiliary_loss_mlp": 0.01044486, "balance_loss_clip": 1.05651855, "balance_loss_mlp": 1.03283882, "epoch": 0.10581374376240005, "flos": 17311385790720.0, "grad_norm": 1.6466070118151486, "language_loss": 0.82759571, "learning_rate": 3.940092076963812e-06, "loss": 0.84970474, "num_input_tokens_seen": 18685875, "step": 880, "time_per_iteration": 2.651867151260376 }, { "auxiliary_loss_clip": 0.01215478, "auxiliary_loss_mlp": 0.01049756, "balance_loss_clip": 1.06338549, "balance_loss_mlp": 1.03774536, "epoch": 0.10593398665303914, "flos": 34349454017280.0, "grad_norm": 2.596769395717113, "language_loss": 0.78943717, "learning_rate": 3.9399027011280355e-06, "loss": 0.81208956, "num_input_tokens_seen": 18707970, "step": 881, "time_per_iteration": 3.5031917095184326 }, { "auxiliary_loss_clip": 0.01223363, "auxiliary_loss_mlp": 0.01046229, "balance_loss_clip": 1.0750134, "balance_loss_mlp": 1.03425407, "epoch": 0.10605422954367823, "flos": 23257977068160.0, "grad_norm": 1.9741042812167036, "language_loss": 0.77013195, "learning_rate": 3.939713031010561e-06, "loss": 0.7928279, "num_input_tokens_seen": 18726335, "step": 882, "time_per_iteration": 2.5636136531829834 }, { "auxiliary_loss_clip": 0.01203804, "auxiliary_loss_mlp": 0.01044784, "balance_loss_clip": 1.06870294, "balance_loss_mlp": 1.03313148, "epoch": 0.10617447243431732, "flos": 22820118278400.0, "grad_norm": 2.059907967737452, "language_loss": 0.77541101, "learning_rate": 3.939523066640163e-06, "loss": 0.79789686, "num_input_tokens_seen": 18745230, "step": 883, "time_per_iteration": 2.589808225631714 }, { "auxiliary_loss_clip": 0.01234299, "auxiliary_loss_mlp": 0.0103855, "balance_loss_clip": 1.07017171, "balance_loss_mlp": 1.02770185, "epoch": 0.10629471532495641, "flos": 24386577373440.0, "grad_norm": 1.839702999152693, "language_loss": 0.81147206, "learning_rate": 3.939332808045657e-06, "loss": 0.83420056, "num_input_tokens_seen": 18764880, "step": 884, "time_per_iteration": 2.563209295272827 }, { "auxiliary_loss_clip": 0.01204016, "auxiliary_loss_mlp": 0.01043246, "balance_loss_clip": 1.0670042, "balance_loss_mlp": 1.03236246, "epoch": 0.1064149582155955, "flos": 21105491581440.0, "grad_norm": 2.3814567433771683, "language_loss": 0.84671998, "learning_rate": 3.939142255255906e-06, "loss": 0.8691926, "num_input_tokens_seen": 18785765, "step": 885, "time_per_iteration": 2.611672878265381 }, { "auxiliary_loss_clip": 0.01238769, "auxiliary_loss_mlp": 0.01041848, "balance_loss_clip": 1.07610118, "balance_loss_mlp": 1.03083873, "epoch": 0.1065352011062346, "flos": 20702035042560.0, "grad_norm": 1.773633803805175, "language_loss": 0.86951971, "learning_rate": 3.938951408299817e-06, "loss": 0.89232588, "num_input_tokens_seen": 18804605, "step": 886, "time_per_iteration": 2.534438133239746 }, { "auxiliary_loss_clip": 0.01097505, "auxiliary_loss_mlp": 0.01013407, "balance_loss_clip": 1.04948783, "balance_loss_mlp": 1.01011682, "epoch": 0.10665544399687368, "flos": 62659632689280.0, "grad_norm": 0.8029375908190648, "language_loss": 0.54455644, "learning_rate": 3.938760267206342e-06, "loss": 0.56566554, "num_input_tokens_seen": 18866425, "step": 887, "time_per_iteration": 3.328860282897949 }, { "auxiliary_loss_clip": 0.01249148, "auxiliary_loss_mlp": 0.01046916, "balance_loss_clip": 1.0725491, "balance_loss_mlp": 1.0360682, "epoch": 0.10677568688751278, "flos": 26140382830080.0, "grad_norm": 2.333280005403275, "language_loss": 0.78506947, "learning_rate": 3.938568832004475e-06, "loss": 0.80803013, "num_input_tokens_seen": 18885130, "step": 888, "time_per_iteration": 2.669353723526001 }, { "auxiliary_loss_clip": 0.01208806, "auxiliary_loss_mlp": 0.01043293, "balance_loss_clip": 1.06697381, "balance_loss_mlp": 1.03275442, "epoch": 0.10689592977815186, "flos": 12786533712000.0, "grad_norm": 1.9262929940585052, "language_loss": 0.75506639, "learning_rate": 3.938377102723257e-06, "loss": 0.77758735, "num_input_tokens_seen": 18902265, "step": 889, "time_per_iteration": 2.550930976867676 }, { "auxiliary_loss_clip": 0.01168247, "auxiliary_loss_mlp": 0.01046813, "balance_loss_clip": 1.05949044, "balance_loss_mlp": 1.03487372, "epoch": 0.10701617266879096, "flos": 22126683242880.0, "grad_norm": 2.0025631197032467, "language_loss": 0.83228242, "learning_rate": 3.938185079391774e-06, "loss": 0.85443306, "num_input_tokens_seen": 18919310, "step": 890, "time_per_iteration": 2.643177032470703 }, { "auxiliary_loss_clip": 0.01248805, "auxiliary_loss_mlp": 0.01038635, "balance_loss_clip": 1.0724628, "balance_loss_mlp": 1.02796006, "epoch": 0.10713641555943004, "flos": 19745625559680.0, "grad_norm": 3.5349919053328205, "language_loss": 1.0572294, "learning_rate": 3.937992762039157e-06, "loss": 1.08010375, "num_input_tokens_seen": 18932635, "step": 891, "time_per_iteration": 2.474825143814087 }, { "auxiliary_loss_clip": 0.01231377, "auxiliary_loss_mlp": 0.01046879, "balance_loss_clip": 1.0717119, "balance_loss_mlp": 1.03645396, "epoch": 0.10725665845006914, "flos": 23952992302080.0, "grad_norm": 1.6677336179866498, "language_loss": 0.80502594, "learning_rate": 3.937800150694577e-06, "loss": 0.8278085, "num_input_tokens_seen": 18953810, "step": 892, "time_per_iteration": 2.552788019180298 }, { "auxiliary_loss_clip": 0.01191222, "auxiliary_loss_mlp": 0.01053589, "balance_loss_clip": 1.06718135, "balance_loss_mlp": 1.04247904, "epoch": 0.10737690134070824, "flos": 18551704371840.0, "grad_norm": 2.1736515174268383, "language_loss": 0.75921512, "learning_rate": 3.937607245387255e-06, "loss": 0.7816633, "num_input_tokens_seen": 18973175, "step": 893, "time_per_iteration": 2.647382974624634 }, { "auxiliary_loss_clip": 0.01222079, "auxiliary_loss_mlp": 0.01048056, "balance_loss_clip": 1.06548119, "balance_loss_mlp": 1.03822136, "epoch": 0.10749714423134732, "flos": 22707609903360.0, "grad_norm": 2.1539651830437845, "language_loss": 0.72253096, "learning_rate": 3.937414046146455e-06, "loss": 0.74523234, "num_input_tokens_seen": 18991130, "step": 894, "time_per_iteration": 2.5574097633361816 }, { "auxiliary_loss_clip": 0.01250393, "auxiliary_loss_mlp": 0.01057096, "balance_loss_clip": 1.07312894, "balance_loss_mlp": 1.04552054, "epoch": 0.10761738712198642, "flos": 21106066199040.0, "grad_norm": 2.4015089977165314, "language_loss": 0.75854194, "learning_rate": 3.9372205530014845e-06, "loss": 0.78161681, "num_input_tokens_seen": 19009610, "step": 895, "time_per_iteration": 2.5158603191375732 }, { "auxiliary_loss_clip": 0.01250407, "auxiliary_loss_mlp": 0.01047045, "balance_loss_clip": 1.07064474, "balance_loss_mlp": 1.03666782, "epoch": 0.1077376300126255, "flos": 23766723348480.0, "grad_norm": 1.9520468563148006, "language_loss": 0.71447635, "learning_rate": 3.937026765981696e-06, "loss": 0.7374509, "num_input_tokens_seen": 19029680, "step": 896, "time_per_iteration": 2.5126612186431885 }, { "auxiliary_loss_clip": 0.01206343, "auxiliary_loss_mlp": 0.010473, "balance_loss_clip": 1.06934905, "balance_loss_mlp": 1.03596282, "epoch": 0.1078578729032646, "flos": 20919581763840.0, "grad_norm": 2.4407745960401113, "language_loss": 0.79492223, "learning_rate": 3.936832685116488e-06, "loss": 0.81745869, "num_input_tokens_seen": 19047775, "step": 897, "time_per_iteration": 2.5977816581726074 }, { "auxiliary_loss_clip": 0.01248, "auxiliary_loss_mlp": 0.01035667, "balance_loss_clip": 1.07149601, "balance_loss_mlp": 1.02475977, "epoch": 0.10797811579390369, "flos": 14829886702080.0, "grad_norm": 2.028315717356634, "language_loss": 0.89903009, "learning_rate": 3.936638310435301e-06, "loss": 0.92186677, "num_input_tokens_seen": 19065640, "step": 898, "time_per_iteration": 2.4515583515167236 }, { "auxiliary_loss_clip": 0.01239838, "auxiliary_loss_mlp": 0.0103736, "balance_loss_clip": 1.07089341, "balance_loss_mlp": 1.02591038, "epoch": 0.10809835868454278, "flos": 19536985411200.0, "grad_norm": 1.7471263731748448, "language_loss": 0.81238055, "learning_rate": 3.936443641967623e-06, "loss": 0.83515257, "num_input_tokens_seen": 19084470, "step": 899, "time_per_iteration": 2.5281920433044434 }, { "auxiliary_loss_clip": 0.01223425, "auxiliary_loss_mlp": 0.01043294, "balance_loss_clip": 1.07012331, "balance_loss_mlp": 1.03245139, "epoch": 0.10821860157518187, "flos": 18442320480000.0, "grad_norm": 2.3373922167157, "language_loss": 0.83319074, "learning_rate": 3.936248679742983e-06, "loss": 0.85585791, "num_input_tokens_seen": 19102965, "step": 900, "time_per_iteration": 2.5299389362335205 }, { "auxiliary_loss_clip": 0.01098339, "auxiliary_loss_mlp": 0.01008545, "balance_loss_clip": 1.03250122, "balance_loss_mlp": 1.00520742, "epoch": 0.10833884446582095, "flos": 49359468447360.0, "grad_norm": 1.0472724456862286, "language_loss": 0.7019968, "learning_rate": 3.936053423790959e-06, "loss": 0.72306561, "num_input_tokens_seen": 19151285, "step": 901, "time_per_iteration": 2.9408888816833496 }, { "auxiliary_loss_clip": 0.01246435, "auxiliary_loss_mlp": 0.01042367, "balance_loss_clip": 1.07119405, "balance_loss_mlp": 1.03173327, "epoch": 0.10845908735646005, "flos": 20411912891520.0, "grad_norm": 1.7430881584978353, "language_loss": 0.77579689, "learning_rate": 3.935857874141168e-06, "loss": 0.7986849, "num_input_tokens_seen": 19170120, "step": 902, "time_per_iteration": 3.32848858833313 }, { "auxiliary_loss_clip": 0.01208899, "auxiliary_loss_mlp": 0.01042285, "balance_loss_clip": 1.06662869, "balance_loss_mlp": 1.03148472, "epoch": 0.10857933024709913, "flos": 14027750133120.0, "grad_norm": 2.2394725777520694, "language_loss": 0.83277917, "learning_rate": 3.935662030823279e-06, "loss": 0.85529101, "num_input_tokens_seen": 19186305, "step": 903, "time_per_iteration": 2.501368522644043 }, { "auxiliary_loss_clip": 0.01237746, "auxiliary_loss_mlp": 0.01039626, "balance_loss_clip": 1.07016683, "balance_loss_mlp": 1.02827084, "epoch": 0.10869957313773823, "flos": 13369004657280.0, "grad_norm": 2.6532362540217087, "language_loss": 0.72506303, "learning_rate": 3.935465893866998e-06, "loss": 0.74783671, "num_input_tokens_seen": 19204530, "step": 904, "time_per_iteration": 2.5257515907287598 }, { "auxiliary_loss_clip": 0.01218221, "auxiliary_loss_mlp": 0.01042324, "balance_loss_clip": 1.07023072, "balance_loss_mlp": 1.03169703, "epoch": 0.10881981602837733, "flos": 25807095509760.0, "grad_norm": 1.9888446048696702, "language_loss": 0.80032659, "learning_rate": 3.935269463302079e-06, "loss": 0.82293206, "num_input_tokens_seen": 19222735, "step": 905, "time_per_iteration": 4.054166555404663 }, { "auxiliary_loss_clip": 0.01239324, "auxiliary_loss_mlp": 0.01040748, "balance_loss_clip": 1.0718143, "balance_loss_mlp": 1.02771807, "epoch": 0.10894005891901641, "flos": 20777555387520.0, "grad_norm": 1.978536199947901, "language_loss": 0.76860577, "learning_rate": 3.935072739158322e-06, "loss": 0.79140645, "num_input_tokens_seen": 19242445, "step": 906, "time_per_iteration": 2.525634527206421 }, { "auxiliary_loss_clip": 0.01217747, "auxiliary_loss_mlp": 0.01044287, "balance_loss_clip": 1.06756938, "balance_loss_mlp": 1.03344536, "epoch": 0.10906030180965551, "flos": 26649883296000.0, "grad_norm": 1.513345247887908, "language_loss": 0.7973218, "learning_rate": 3.934875721465569e-06, "loss": 0.81994212, "num_input_tokens_seen": 19262865, "step": 907, "time_per_iteration": 3.4201667308807373 }, { "auxiliary_loss_clip": 0.01215738, "auxiliary_loss_mlp": 0.01049837, "balance_loss_clip": 1.06522679, "balance_loss_mlp": 1.0379281, "epoch": 0.10918054470029459, "flos": 36534402420480.0, "grad_norm": 2.3763718207639606, "language_loss": 0.71539545, "learning_rate": 3.9346784102537076e-06, "loss": 0.73805118, "num_input_tokens_seen": 19285000, "step": 908, "time_per_iteration": 2.657378673553467 }, { "auxiliary_loss_clip": 0.01243968, "auxiliary_loss_mlp": 0.01045357, "balance_loss_clip": 1.06807375, "balance_loss_mlp": 1.03514671, "epoch": 0.10930078759093369, "flos": 21762549118080.0, "grad_norm": 1.814088302175646, "language_loss": 0.78386641, "learning_rate": 3.934480805552669e-06, "loss": 0.8067596, "num_input_tokens_seen": 19306010, "step": 909, "time_per_iteration": 2.504197597503662 }, { "auxiliary_loss_clip": 0.01246513, "auxiliary_loss_mlp": 0.00763976, "balance_loss_clip": 1.07114446, "balance_loss_mlp": 1.00010991, "epoch": 0.10942103048157277, "flos": 22601781457920.0, "grad_norm": 2.0383579493322004, "language_loss": 0.87991786, "learning_rate": 3.93428290739243e-06, "loss": 0.90002275, "num_input_tokens_seen": 19325380, "step": 910, "time_per_iteration": 2.487342119216919 }, { "auxiliary_loss_clip": 0.01217432, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.0677706, "balance_loss_mlp": 1.02446222, "epoch": 0.10954127337221187, "flos": 15045781397760.0, "grad_norm": 2.232268071685451, "language_loss": 0.79870701, "learning_rate": 3.9340847158030125e-06, "loss": 0.82124031, "num_input_tokens_seen": 19338960, "step": 911, "time_per_iteration": 2.5157434940338135 }, { "auxiliary_loss_clip": 0.01234488, "auxiliary_loss_mlp": 0.01047605, "balance_loss_clip": 1.06924248, "balance_loss_mlp": 1.03698325, "epoch": 0.10966151626285096, "flos": 21650974496640.0, "grad_norm": 1.853598038185026, "language_loss": 0.75309628, "learning_rate": 3.9338862308144814e-06, "loss": 0.77591711, "num_input_tokens_seen": 19357780, "step": 912, "time_per_iteration": 2.502610683441162 }, { "auxiliary_loss_clip": 0.01248307, "auxiliary_loss_mlp": 0.0104701, "balance_loss_clip": 1.07165384, "balance_loss_mlp": 1.03662121, "epoch": 0.10978175915349005, "flos": 20121359777280.0, "grad_norm": 1.6280666877085248, "language_loss": 0.84357464, "learning_rate": 3.933687452456946e-06, "loss": 0.86652786, "num_input_tokens_seen": 19377680, "step": 913, "time_per_iteration": 2.4641408920288086 }, { "auxiliary_loss_clip": 0.0119383, "auxiliary_loss_mlp": 0.01037327, "balance_loss_clip": 1.0591464, "balance_loss_mlp": 1.02501249, "epoch": 0.10990200204412914, "flos": 20412667077120.0, "grad_norm": 4.03532649811657, "language_loss": 0.86149353, "learning_rate": 3.933488380760562e-06, "loss": 0.88380516, "num_input_tokens_seen": 19397040, "step": 914, "time_per_iteration": 2.5571024417877197 }, { "auxiliary_loss_clip": 0.01248567, "auxiliary_loss_mlp": 0.00764528, "balance_loss_clip": 1.07145095, "balance_loss_mlp": 1.00008321, "epoch": 0.11002224493476823, "flos": 17530117660800.0, "grad_norm": 1.9794344043463137, "language_loss": 0.87152588, "learning_rate": 3.9332890157555286e-06, "loss": 0.89165682, "num_input_tokens_seen": 19413975, "step": 915, "time_per_iteration": 2.4526405334472656 }, { "auxiliary_loss_clip": 0.01221672, "auxiliary_loss_mlp": 0.0104699, "balance_loss_clip": 1.0684762, "balance_loss_mlp": 1.03568292, "epoch": 0.11014248782540732, "flos": 12203093099520.0, "grad_norm": 2.289663525020898, "language_loss": 0.76395983, "learning_rate": 3.933089357472088e-06, "loss": 0.78664649, "num_input_tokens_seen": 19432005, "step": 916, "time_per_iteration": 2.4978785514831543 }, { "auxiliary_loss_clip": 0.01249824, "auxiliary_loss_mlp": 0.01044248, "balance_loss_clip": 1.0734427, "balance_loss_mlp": 1.03328633, "epoch": 0.11026273071604642, "flos": 22382977760640.0, "grad_norm": 1.7778952291835857, "language_loss": 0.85864955, "learning_rate": 3.932889405940529e-06, "loss": 0.88159031, "num_input_tokens_seen": 19450100, "step": 917, "time_per_iteration": 2.488654136657715 }, { "auxiliary_loss_clip": 0.01223596, "auxiliary_loss_mlp": 0.01042277, "balance_loss_clip": 1.07454467, "balance_loss_mlp": 1.03145242, "epoch": 0.1103829736066855, "flos": 19829046896640.0, "grad_norm": 2.164698477174344, "language_loss": 0.79746985, "learning_rate": 3.932689161191184e-06, "loss": 0.82012856, "num_input_tokens_seen": 19467805, "step": 918, "time_per_iteration": 2.527545928955078 }, { "auxiliary_loss_clip": 0.01230893, "auxiliary_loss_mlp": 0.01043044, "balance_loss_clip": 1.06721413, "balance_loss_mlp": 1.03174317, "epoch": 0.1105032164973246, "flos": 22669616292480.0, "grad_norm": 2.9836227759873966, "language_loss": 0.87975824, "learning_rate": 3.93248862325443e-06, "loss": 0.90249759, "num_input_tokens_seen": 19486710, "step": 919, "time_per_iteration": 2.527946949005127 }, { "auxiliary_loss_clip": 0.01130113, "auxiliary_loss_mlp": 0.01007582, "balance_loss_clip": 1.03725338, "balance_loss_mlp": 1.00422049, "epoch": 0.11062345938796368, "flos": 66483507876480.0, "grad_norm": 0.9480432606464496, "language_loss": 0.6446889, "learning_rate": 3.932287792160688e-06, "loss": 0.66606587, "num_input_tokens_seen": 19545170, "step": 920, "time_per_iteration": 3.0086216926574707 }, { "auxiliary_loss_clip": 0.01233536, "auxiliary_loss_mlp": 0.01057737, "balance_loss_clip": 1.0672487, "balance_loss_mlp": 1.04593492, "epoch": 0.11074370227860278, "flos": 21907771804800.0, "grad_norm": 2.3336433656492157, "language_loss": 0.80610073, "learning_rate": 3.932086667940424e-06, "loss": 0.82901341, "num_input_tokens_seen": 19561875, "step": 921, "time_per_iteration": 2.5176546573638916 }, { "auxiliary_loss_clip": 0.01232069, "auxiliary_loss_mlp": 0.00764056, "balance_loss_clip": 1.07112169, "balance_loss_mlp": 1.00010467, "epoch": 0.11086394516924186, "flos": 28658115763200.0, "grad_norm": 1.7945901445490888, "language_loss": 0.81595868, "learning_rate": 3.93188525062415e-06, "loss": 0.83591998, "num_input_tokens_seen": 19582340, "step": 922, "time_per_iteration": 2.5978198051452637 }, { "auxiliary_loss_clip": 0.0123443, "auxiliary_loss_mlp": 0.01047624, "balance_loss_clip": 1.06924939, "balance_loss_mlp": 1.03533983, "epoch": 0.11098418805988096, "flos": 24535247765760.0, "grad_norm": 2.0473126143712803, "language_loss": 0.85970914, "learning_rate": 3.931683540242418e-06, "loss": 0.88252968, "num_input_tokens_seen": 19603405, "step": 923, "time_per_iteration": 2.57163143157959 }, { "auxiliary_loss_clip": 0.01228415, "auxiliary_loss_mlp": 0.01044187, "balance_loss_clip": 1.06761456, "balance_loss_mlp": 1.03316021, "epoch": 0.11110443095052006, "flos": 22960384888320.0, "grad_norm": 2.5377431758959603, "language_loss": 0.9105643, "learning_rate": 3.9314815368258295e-06, "loss": 0.93329036, "num_input_tokens_seen": 19619885, "step": 924, "time_per_iteration": 2.5258777141571045 }, { "auxiliary_loss_clip": 0.01238387, "auxiliary_loss_mlp": 0.01038982, "balance_loss_clip": 1.07390344, "balance_loss_mlp": 1.02868247, "epoch": 0.11122467384115914, "flos": 18950025265920.0, "grad_norm": 1.7252943364941904, "language_loss": 0.78803492, "learning_rate": 3.9312792404050275e-06, "loss": 0.81080866, "num_input_tokens_seen": 19637940, "step": 925, "time_per_iteration": 2.5064358711242676 }, { "auxiliary_loss_clip": 0.01244987, "auxiliary_loss_mlp": 0.01047185, "balance_loss_clip": 1.07118607, "balance_loss_mlp": 1.03727221, "epoch": 0.11134491673179824, "flos": 25082957324160.0, "grad_norm": 5.115479374243775, "language_loss": 0.77323759, "learning_rate": 3.9310766510107e-06, "loss": 0.79615927, "num_input_tokens_seen": 19657115, "step": 926, "time_per_iteration": 2.513786554336548 }, { "auxiliary_loss_clip": 0.01204415, "auxiliary_loss_mlp": 0.01046335, "balance_loss_clip": 1.06521225, "balance_loss_mlp": 1.03349042, "epoch": 0.11146515962243732, "flos": 24499121662080.0, "grad_norm": 1.8579085080426099, "language_loss": 0.9196015, "learning_rate": 3.9308737686735806e-06, "loss": 0.94210899, "num_input_tokens_seen": 19677075, "step": 927, "time_per_iteration": 2.6081809997558594 }, { "auxiliary_loss_clip": 0.0124958, "auxiliary_loss_mlp": 0.01050704, "balance_loss_clip": 1.07328749, "balance_loss_mlp": 1.03933096, "epoch": 0.11158540251307641, "flos": 22343763087360.0, "grad_norm": 2.2441634463238933, "language_loss": 0.82748544, "learning_rate": 3.9306705934244455e-06, "loss": 0.85048831, "num_input_tokens_seen": 19697155, "step": 928, "time_per_iteration": 3.2197580337524414 }, { "auxiliary_loss_clip": 0.01204324, "auxiliary_loss_mlp": 0.01050717, "balance_loss_clip": 1.06646705, "balance_loss_mlp": 1.04034007, "epoch": 0.11170564540371551, "flos": 19902304684800.0, "grad_norm": 1.9821112777867815, "language_loss": 0.88482773, "learning_rate": 3.930467125294116e-06, "loss": 0.90737808, "num_input_tokens_seen": 19716705, "step": 929, "time_per_iteration": 2.556999683380127 }, { "auxiliary_loss_clip": 0.01069391, "auxiliary_loss_mlp": 0.01007343, "balance_loss_clip": 1.03120768, "balance_loss_mlp": 1.00386238, "epoch": 0.1118258882943546, "flos": 64586239499520.0, "grad_norm": 0.9265973861644086, "language_loss": 0.60494685, "learning_rate": 3.930263364313458e-06, "loss": 0.62571418, "num_input_tokens_seen": 19767275, "step": 930, "time_per_iteration": 3.0353214740753174 }, { "auxiliary_loss_clip": 0.01201746, "auxiliary_loss_mlp": 0.01042379, "balance_loss_clip": 1.06627882, "balance_loss_mlp": 1.03148973, "epoch": 0.11194613118499369, "flos": 17201965985280.0, "grad_norm": 1.886443152541927, "language_loss": 0.83034509, "learning_rate": 3.930059310513384e-06, "loss": 0.85278636, "num_input_tokens_seen": 19786315, "step": 931, "time_per_iteration": 3.982970714569092 }, { "auxiliary_loss_clip": 0.01188293, "auxiliary_loss_mlp": 0.00764389, "balance_loss_clip": 1.0641613, "balance_loss_mlp": 1.00009131, "epoch": 0.11206637407563277, "flos": 31863465728640.0, "grad_norm": 1.8127654205846255, "language_loss": 0.83984828, "learning_rate": 3.929854963924846e-06, "loss": 0.85937512, "num_input_tokens_seen": 19806580, "step": 932, "time_per_iteration": 2.6635072231292725 }, { "auxiliary_loss_clip": 0.01199909, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.06283259, "balance_loss_mlp": 1.03011894, "epoch": 0.11218661696627187, "flos": 21945621761280.0, "grad_norm": 1.9112604564073543, "language_loss": 0.77346182, "learning_rate": 3.929650324578845e-06, "loss": 0.79586577, "num_input_tokens_seen": 19826045, "step": 933, "time_per_iteration": 2.5690670013427734 }, { "auxiliary_loss_clip": 0.01217117, "auxiliary_loss_mlp": 0.01048947, "balance_loss_clip": 1.0674305, "balance_loss_mlp": 1.03661537, "epoch": 0.11230685985691095, "flos": 25878198481920.0, "grad_norm": 3.5620467060466203, "language_loss": 0.82116652, "learning_rate": 3.929445392506423e-06, "loss": 0.84382713, "num_input_tokens_seen": 19843985, "step": 934, "time_per_iteration": 3.3918347358703613 }, { "auxiliary_loss_clip": 0.01234024, "auxiliary_loss_mlp": 0.01045207, "balance_loss_clip": 1.07558608, "balance_loss_mlp": 1.03468692, "epoch": 0.11242710274755005, "flos": 22231506107520.0, "grad_norm": 1.9692801396890847, "language_loss": 0.76019627, "learning_rate": 3.92924016773867e-06, "loss": 0.78298855, "num_input_tokens_seen": 19860480, "step": 935, "time_per_iteration": 2.5194613933563232 }, { "auxiliary_loss_clip": 0.0121292, "auxiliary_loss_mlp": 0.00763087, "balance_loss_clip": 1.06406093, "balance_loss_mlp": 1.00007117, "epoch": 0.11254734563818915, "flos": 17712184723200.0, "grad_norm": 2.277876458523468, "language_loss": 0.7327826, "learning_rate": 3.9290346503067175e-06, "loss": 0.75254261, "num_input_tokens_seen": 19877145, "step": 936, "time_per_iteration": 2.5319368839263916 }, { "auxiliary_loss_clip": 0.01233592, "auxiliary_loss_mlp": 0.01048839, "balance_loss_clip": 1.06894588, "balance_loss_mlp": 1.03779435, "epoch": 0.11266758852882823, "flos": 54930397334400.0, "grad_norm": 1.737440878259158, "language_loss": 0.78951848, "learning_rate": 3.9288288402417415e-06, "loss": 0.81234282, "num_input_tokens_seen": 19903405, "step": 937, "time_per_iteration": 2.8238298892974854 }, { "auxiliary_loss_clip": 0.01236841, "auxiliary_loss_mlp": 0.01042997, "balance_loss_clip": 1.07390261, "balance_loss_mlp": 1.03097486, "epoch": 0.11278783141946733, "flos": 18878132194560.0, "grad_norm": 2.259250823817212, "language_loss": 0.70620245, "learning_rate": 3.928622737574964e-06, "loss": 0.72900081, "num_input_tokens_seen": 19918740, "step": 938, "time_per_iteration": 2.4704511165618896 }, { "auxiliary_loss_clip": 0.01216082, "auxiliary_loss_mlp": 0.01050647, "balance_loss_clip": 1.06690454, "balance_loss_mlp": 1.04009724, "epoch": 0.11290807431010641, "flos": 26469252777600.0, "grad_norm": 1.9025600073114202, "language_loss": 0.90966594, "learning_rate": 3.928416342337652e-06, "loss": 0.93233329, "num_input_tokens_seen": 19938475, "step": 939, "time_per_iteration": 2.5807857513427734 }, { "auxiliary_loss_clip": 0.01219274, "auxiliary_loss_mlp": 0.01047018, "balance_loss_clip": 1.06977141, "balance_loss_mlp": 1.03699839, "epoch": 0.1130283172007455, "flos": 22710590732160.0, "grad_norm": 1.8355422218335449, "language_loss": 0.82640743, "learning_rate": 3.928209654561113e-06, "loss": 0.84907043, "num_input_tokens_seen": 19959310, "step": 940, "time_per_iteration": 2.5475242137908936 }, { "auxiliary_loss_clip": 0.01210881, "auxiliary_loss_mlp": 0.01051042, "balance_loss_clip": 1.06972957, "balance_loss_mlp": 1.0413444, "epoch": 0.1131485600913846, "flos": 23219911630080.0, "grad_norm": 2.4165339146653837, "language_loss": 0.81280476, "learning_rate": 3.928002674276703e-06, "loss": 0.83542395, "num_input_tokens_seen": 19978700, "step": 941, "time_per_iteration": 2.561915636062622 }, { "auxiliary_loss_clip": 0.01160727, "auxiliary_loss_mlp": 0.0103903, "balance_loss_clip": 1.05465233, "balance_loss_mlp": 1.02787828, "epoch": 0.11326880298202369, "flos": 14064271286400.0, "grad_norm": 2.277653261927664, "language_loss": 0.75449318, "learning_rate": 3.92779540151582e-06, "loss": 0.77649075, "num_input_tokens_seen": 19995785, "step": 942, "time_per_iteration": 2.5538668632507324 }, { "auxiliary_loss_clip": 0.01214587, "auxiliary_loss_mlp": 0.01047391, "balance_loss_clip": 1.06747031, "balance_loss_mlp": 1.03654921, "epoch": 0.11338904587266278, "flos": 16325386479360.0, "grad_norm": 1.7691617912251245, "language_loss": 0.85655558, "learning_rate": 3.927587836309907e-06, "loss": 0.87917536, "num_input_tokens_seen": 20013615, "step": 943, "time_per_iteration": 2.5838091373443604 }, { "auxiliary_loss_clip": 0.01207, "auxiliary_loss_mlp": 0.01049268, "balance_loss_clip": 1.06298161, "balance_loss_mlp": 1.03833663, "epoch": 0.11350928876330187, "flos": 24426258923520.0, "grad_norm": 14.847520825832634, "language_loss": 0.78070915, "learning_rate": 3.927379978690452e-06, "loss": 0.80327189, "num_input_tokens_seen": 20032880, "step": 944, "time_per_iteration": 2.5879645347595215 }, { "auxiliary_loss_clip": 0.01187581, "auxiliary_loss_mlp": 0.01042993, "balance_loss_clip": 1.05752409, "balance_loss_mlp": 1.03296208, "epoch": 0.11362953165394096, "flos": 24497074586880.0, "grad_norm": 2.6487092013253197, "language_loss": 0.87392259, "learning_rate": 3.927171828688987e-06, "loss": 0.89622831, "num_input_tokens_seen": 20052405, "step": 945, "time_per_iteration": 2.6133930683135986 }, { "auxiliary_loss_clip": 0.01243782, "auxiliary_loss_mlp": 0.01036419, "balance_loss_clip": 1.07018054, "balance_loss_mlp": 1.02614963, "epoch": 0.11374977454458005, "flos": 24060831909120.0, "grad_norm": 2.312353961959972, "language_loss": 0.82289994, "learning_rate": 3.926963386337088e-06, "loss": 0.84570193, "num_input_tokens_seen": 20070635, "step": 946, "time_per_iteration": 2.535149097442627 }, { "auxiliary_loss_clip": 0.01251538, "auxiliary_loss_mlp": 0.01042055, "balance_loss_clip": 1.07483101, "balance_loss_mlp": 1.03039074, "epoch": 0.11387001743521914, "flos": 39457638967680.0, "grad_norm": 2.2371452498724365, "language_loss": 0.70287466, "learning_rate": 3.926754651666375e-06, "loss": 0.72581053, "num_input_tokens_seen": 20091195, "step": 947, "time_per_iteration": 2.6291024684906006 }, { "auxiliary_loss_clip": 0.01201283, "auxiliary_loss_mlp": 0.01045291, "balance_loss_clip": 1.06741762, "balance_loss_mlp": 1.03469276, "epoch": 0.11399026032585824, "flos": 25082454533760.0, "grad_norm": 2.4956148790224066, "language_loss": 0.78069115, "learning_rate": 3.926545624708513e-06, "loss": 0.80315685, "num_input_tokens_seen": 20110435, "step": 948, "time_per_iteration": 2.598841667175293 }, { "auxiliary_loss_clip": 0.01199208, "auxiliary_loss_mlp": 0.01041078, "balance_loss_clip": 1.06460309, "balance_loss_mlp": 1.03085542, "epoch": 0.11411050321649732, "flos": 17961835224960.0, "grad_norm": 2.0515818943790105, "language_loss": 0.85824728, "learning_rate": 3.926336305495213e-06, "loss": 0.88065004, "num_input_tokens_seen": 20128995, "step": 949, "time_per_iteration": 2.575958490371704 }, { "auxiliary_loss_clip": 0.01186199, "auxiliary_loss_mlp": 0.01043695, "balance_loss_clip": 1.06077933, "balance_loss_mlp": 1.03335392, "epoch": 0.11423074610713642, "flos": 22455409536000.0, "grad_norm": 3.102117660624367, "language_loss": 0.88800848, "learning_rate": 3.926126694058226e-06, "loss": 0.91030741, "num_input_tokens_seen": 20148145, "step": 950, "time_per_iteration": 2.5694007873535156 }, { "auxiliary_loss_clip": 0.01187571, "auxiliary_loss_mlp": 0.01051143, "balance_loss_clip": 1.06797028, "balance_loss_mlp": 1.04102802, "epoch": 0.1143509889977755, "flos": 19717687756800.0, "grad_norm": 1.432414656962617, "language_loss": 0.82136917, "learning_rate": 3.92591679042935e-06, "loss": 0.8437562, "num_input_tokens_seen": 20168035, "step": 951, "time_per_iteration": 2.606205940246582 }, { "auxiliary_loss_clip": 0.01230191, "auxiliary_loss_mlp": 0.01041722, "balance_loss_clip": 1.07095098, "balance_loss_mlp": 1.02980101, "epoch": 0.1144712318884146, "flos": 19822869757440.0, "grad_norm": 1.5893476569464955, "language_loss": 0.82427156, "learning_rate": 3.92570659464043e-06, "loss": 0.8469907, "num_input_tokens_seen": 20186095, "step": 952, "time_per_iteration": 2.5002236366271973 }, { "auxiliary_loss_clip": 0.01228701, "auxiliary_loss_mlp": 0.00763462, "balance_loss_clip": 1.07038617, "balance_loss_mlp": 1.00008583, "epoch": 0.1145914747790537, "flos": 14939198766720.0, "grad_norm": 2.090783299752373, "language_loss": 0.79642951, "learning_rate": 3.925496106723349e-06, "loss": 0.81635112, "num_input_tokens_seen": 20203535, "step": 953, "time_per_iteration": 2.4995129108428955 }, { "auxiliary_loss_clip": 0.01228695, "auxiliary_loss_mlp": 0.01048317, "balance_loss_clip": 1.06869578, "balance_loss_mlp": 1.03825557, "epoch": 0.11471171766969278, "flos": 19865029345920.0, "grad_norm": 2.6282307248863397, "language_loss": 0.83615541, "learning_rate": 3.9252853267100405e-06, "loss": 0.85892558, "num_input_tokens_seen": 20222780, "step": 954, "time_per_iteration": 3.296628713607788 }, { "auxiliary_loss_clip": 0.0119258, "auxiliary_loss_mlp": 0.01042857, "balance_loss_clip": 1.06709409, "balance_loss_mlp": 1.03197908, "epoch": 0.11483196056033187, "flos": 22526476594560.0, "grad_norm": 1.789309610338752, "language_loss": 0.8371259, "learning_rate": 3.9250742546324786e-06, "loss": 0.85948026, "num_input_tokens_seen": 20243015, "step": 955, "time_per_iteration": 2.604696035385132 }, { "auxiliary_loss_clip": 0.01212603, "auxiliary_loss_mlp": 0.0104542, "balance_loss_clip": 1.06500053, "balance_loss_mlp": 1.03526366, "epoch": 0.11495220345097096, "flos": 28220292887040.0, "grad_norm": 1.6901784695171744, "language_loss": 0.8700887, "learning_rate": 3.924862890522683e-06, "loss": 0.89266896, "num_input_tokens_seen": 20263025, "step": 956, "time_per_iteration": 2.6163079738616943 }, { "auxiliary_loss_clip": 0.01227848, "auxiliary_loss_mlp": 0.01057026, "balance_loss_clip": 1.06500506, "balance_loss_mlp": 1.04516435, "epoch": 0.11507244634161005, "flos": 17492267704320.0, "grad_norm": 2.1419889500632032, "language_loss": 0.85912532, "learning_rate": 3.9246512344127174e-06, "loss": 0.88197404, "num_input_tokens_seen": 20280685, "step": 957, "time_per_iteration": 4.208640098571777 }, { "auxiliary_loss_clip": 0.01148945, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.056409, "balance_loss_mlp": 1.02759731, "epoch": 0.11519268923224914, "flos": 22564937082240.0, "grad_norm": 2.104975868089068, "language_loss": 0.81794298, "learning_rate": 3.9244392863346895e-06, "loss": 0.83980858, "num_input_tokens_seen": 20300090, "step": 958, "time_per_iteration": 2.8621292114257812 }, { "auxiliary_loss_clip": 0.01216496, "auxiliary_loss_mlp": 0.01053741, "balance_loss_clip": 1.07067525, "balance_loss_mlp": 1.04220712, "epoch": 0.11531293212288823, "flos": 16982839065600.0, "grad_norm": 1.9126115079367951, "language_loss": 0.92378235, "learning_rate": 3.9242270463207524e-06, "loss": 0.94648468, "num_input_tokens_seen": 20318480, "step": 959, "time_per_iteration": 2.5266430377960205 }, { "auxiliary_loss_clip": 0.01169277, "auxiliary_loss_mlp": 0.01041489, "balance_loss_clip": 1.06025088, "balance_loss_mlp": 1.0302062, "epoch": 0.11543317501352733, "flos": 12422004537600.0, "grad_norm": 2.612402565174101, "language_loss": 0.85360587, "learning_rate": 3.924014514403102e-06, "loss": 0.87571359, "num_input_tokens_seen": 20334635, "step": 960, "time_per_iteration": 3.428123950958252 }, { "auxiliary_loss_clip": 0.01170613, "auxiliary_loss_mlp": 0.01047243, "balance_loss_clip": 1.05839956, "balance_loss_mlp": 1.03558445, "epoch": 0.11555341790416641, "flos": 19821648695040.0, "grad_norm": 1.9705249159668405, "language_loss": 0.9130134, "learning_rate": 3.92380169061398e-06, "loss": 0.93519199, "num_input_tokens_seen": 20352415, "step": 961, "time_per_iteration": 2.6064140796661377 }, { "auxiliary_loss_clip": 0.01188076, "auxiliary_loss_mlp": 0.00764472, "balance_loss_clip": 1.05784774, "balance_loss_mlp": 1.00009096, "epoch": 0.11567366079480551, "flos": 25738865625600.0, "grad_norm": 1.9084322123045003, "language_loss": 0.83613479, "learning_rate": 3.9235885749856705e-06, "loss": 0.85566032, "num_input_tokens_seen": 20371095, "step": 962, "time_per_iteration": 2.6071629524230957 }, { "auxiliary_loss_clip": 0.01214638, "auxiliary_loss_mlp": 0.01044543, "balance_loss_clip": 1.06970787, "balance_loss_mlp": 1.03370142, "epoch": 0.1157939036854446, "flos": 18223301301120.0, "grad_norm": 2.0163073596184913, "language_loss": 0.82715619, "learning_rate": 3.9233751675505035e-06, "loss": 0.84974802, "num_input_tokens_seen": 20389805, "step": 963, "time_per_iteration": 2.5311121940612793 }, { "auxiliary_loss_clip": 0.01211543, "auxiliary_loss_mlp": 0.01047984, "balance_loss_clip": 1.06771255, "balance_loss_mlp": 1.03649259, "epoch": 0.11591414657608369, "flos": 23073755189760.0, "grad_norm": 2.3879310253878088, "language_loss": 0.85124761, "learning_rate": 3.923161468340853e-06, "loss": 0.87384284, "num_input_tokens_seen": 20409640, "step": 964, "time_per_iteration": 2.548724412918091 }, { "auxiliary_loss_clip": 0.01164596, "auxiliary_loss_mlp": 0.01047491, "balance_loss_clip": 1.05488515, "balance_loss_mlp": 1.03645849, "epoch": 0.11603438946672277, "flos": 19461716461440.0, "grad_norm": 2.045262166896079, "language_loss": 0.81566608, "learning_rate": 3.9229474773891374e-06, "loss": 0.83778691, "num_input_tokens_seen": 20428180, "step": 965, "time_per_iteration": 2.6004765033721924 }, { "auxiliary_loss_clip": 0.01202082, "auxiliary_loss_mlp": 0.01041699, "balance_loss_clip": 1.05937195, "balance_loss_mlp": 1.02982545, "epoch": 0.11615463235736187, "flos": 26831986272000.0, "grad_norm": 2.0188323925548213, "language_loss": 0.83729315, "learning_rate": 3.922733194727818e-06, "loss": 0.85973096, "num_input_tokens_seen": 20447975, "step": 966, "time_per_iteration": 2.6260299682617188 }, { "auxiliary_loss_clip": 0.01233903, "auxiliary_loss_mlp": 0.01049225, "balance_loss_clip": 1.07061553, "balance_loss_mlp": 1.037745, "epoch": 0.11627487524800097, "flos": 18580324533120.0, "grad_norm": 1.9403762773585176, "language_loss": 0.87334752, "learning_rate": 3.922518620389402e-06, "loss": 0.89617884, "num_input_tokens_seen": 20464840, "step": 967, "time_per_iteration": 2.509111166000366 }, { "auxiliary_loss_clip": 0.01124698, "auxiliary_loss_mlp": 0.01043963, "balance_loss_clip": 1.05446291, "balance_loss_mlp": 1.03324056, "epoch": 0.11639511813864005, "flos": 18150474476160.0, "grad_norm": 1.7170641848430415, "language_loss": 0.89445019, "learning_rate": 3.922303754406439e-06, "loss": 0.91613686, "num_input_tokens_seen": 20482680, "step": 968, "time_per_iteration": 2.72664737701416 }, { "auxiliary_loss_clip": 0.01172959, "auxiliary_loss_mlp": 0.01043741, "balance_loss_clip": 1.05503273, "balance_loss_mlp": 1.03269625, "epoch": 0.11651536102927915, "flos": 20922023888640.0, "grad_norm": 2.0080600610882526, "language_loss": 0.78988111, "learning_rate": 3.922088596811526e-06, "loss": 0.81204808, "num_input_tokens_seen": 20501810, "step": 969, "time_per_iteration": 2.740123987197876 }, { "auxiliary_loss_clip": 0.01214807, "auxiliary_loss_mlp": 0.01049977, "balance_loss_clip": 1.06466746, "balance_loss_mlp": 1.03863406, "epoch": 0.11663560391991823, "flos": 16508602776960.0, "grad_norm": 2.35869317590607, "language_loss": 0.86652577, "learning_rate": 3.9218731476373e-06, "loss": 0.88917363, "num_input_tokens_seen": 20517995, "step": 970, "time_per_iteration": 2.4864609241485596 }, { "auxiliary_loss_clip": 0.01229663, "auxiliary_loss_mlp": 0.01037193, "balance_loss_clip": 1.06900525, "balance_loss_mlp": 1.02540922, "epoch": 0.11675584681055733, "flos": 19865029345920.0, "grad_norm": 2.180462380128972, "language_loss": 0.84922028, "learning_rate": 3.9216574069164455e-06, "loss": 0.87188888, "num_input_tokens_seen": 20536970, "step": 971, "time_per_iteration": 2.5146942138671875 }, { "auxiliary_loss_clip": 0.01239151, "auxiliary_loss_mlp": 0.01043031, "balance_loss_clip": 1.06715441, "balance_loss_mlp": 1.03293383, "epoch": 0.11687608970119642, "flos": 21944364785280.0, "grad_norm": 1.5684329703262403, "language_loss": 0.80336547, "learning_rate": 3.921441374681691e-06, "loss": 0.82618725, "num_input_tokens_seen": 20557030, "step": 972, "time_per_iteration": 2.496046304702759 }, { "auxiliary_loss_clip": 0.01209078, "auxiliary_loss_mlp": 0.01039586, "balance_loss_clip": 1.06617749, "balance_loss_mlp": 1.02924454, "epoch": 0.1169963325918355, "flos": 24061155131520.0, "grad_norm": 1.815288909968163, "language_loss": 0.64879394, "learning_rate": 3.921225050965808e-06, "loss": 0.6712805, "num_input_tokens_seen": 20576915, "step": 973, "time_per_iteration": 2.567025661468506 }, { "auxiliary_loss_clip": 0.011969, "auxiliary_loss_mlp": 0.01044693, "balance_loss_clip": 1.06524003, "balance_loss_mlp": 1.0336777, "epoch": 0.1171165754824746, "flos": 23368151059200.0, "grad_norm": 2.046361113843374, "language_loss": 0.75033188, "learning_rate": 3.921008435801612e-06, "loss": 0.77274776, "num_input_tokens_seen": 20596000, "step": 974, "time_per_iteration": 2.5719377994537354 }, { "auxiliary_loss_clip": 0.01213001, "auxiliary_loss_mlp": 0.01047457, "balance_loss_clip": 1.06314492, "balance_loss_mlp": 1.03598344, "epoch": 0.11723681837311369, "flos": 18552243075840.0, "grad_norm": 2.1182193404652154, "language_loss": 0.7572149, "learning_rate": 3.920791529221963e-06, "loss": 0.77981949, "num_input_tokens_seen": 20614675, "step": 975, "time_per_iteration": 2.5028796195983887 }, { "auxiliary_loss_clip": 0.01215737, "auxiliary_loss_mlp": 0.00764091, "balance_loss_clip": 1.06563377, "balance_loss_mlp": 1.00011647, "epoch": 0.11735706126375278, "flos": 23550541344000.0, "grad_norm": 1.9582907001725058, "language_loss": 0.7651614, "learning_rate": 3.920574331259768e-06, "loss": 0.78495967, "num_input_tokens_seen": 20635875, "step": 976, "time_per_iteration": 2.5761897563934326 }, { "auxiliary_loss_clip": 0.01202108, "auxiliary_loss_mlp": 0.01045077, "balance_loss_clip": 1.06470919, "balance_loss_mlp": 1.03384185, "epoch": 0.11747730415439187, "flos": 22381541216640.0, "grad_norm": 2.122575776477431, "language_loss": 0.79257643, "learning_rate": 3.9203568419479716e-06, "loss": 0.81504828, "num_input_tokens_seen": 20656430, "step": 977, "time_per_iteration": 2.534876823425293 }, { "auxiliary_loss_clip": 0.01209215, "auxiliary_loss_mlp": 0.01040546, "balance_loss_clip": 1.06407452, "balance_loss_mlp": 1.02985883, "epoch": 0.11759754704503096, "flos": 22200731130240.0, "grad_norm": 1.830706593722671, "language_loss": 0.74941134, "learning_rate": 3.92013906131957e-06, "loss": 0.77190888, "num_input_tokens_seen": 20675360, "step": 978, "time_per_iteration": 2.526524066925049 }, { "auxiliary_loss_clip": 0.01193855, "auxiliary_loss_mlp": 0.01049449, "balance_loss_clip": 1.06303024, "balance_loss_mlp": 1.03899395, "epoch": 0.11771778993567006, "flos": 22309755886080.0, "grad_norm": 1.615912471773369, "language_loss": 0.82558942, "learning_rate": 3.9199209894076e-06, "loss": 0.84802246, "num_input_tokens_seen": 20695675, "step": 979, "time_per_iteration": 2.5733649730682373 }, { "auxiliary_loss_clip": 0.0124346, "auxiliary_loss_mlp": 0.01050297, "balance_loss_clip": 1.06771302, "balance_loss_mlp": 1.0378691, "epoch": 0.11783803282630914, "flos": 21288169175040.0, "grad_norm": 1.8940638460998847, "language_loss": 0.90368426, "learning_rate": 3.919702626245142e-06, "loss": 0.92662185, "num_input_tokens_seen": 20715330, "step": 980, "time_per_iteration": 3.227565050125122 }, { "auxiliary_loss_clip": 0.01196298, "auxiliary_loss_mlp": 0.01041078, "balance_loss_clip": 1.06167197, "balance_loss_mlp": 1.03058147, "epoch": 0.11795827571694824, "flos": 25371535190400.0, "grad_norm": 2.0361380209453577, "language_loss": 0.66328055, "learning_rate": 3.919483971865322e-06, "loss": 0.68565434, "num_input_tokens_seen": 20735325, "step": 981, "time_per_iteration": 2.5614726543426514 }, { "auxiliary_loss_clip": 0.01210102, "auxiliary_loss_mlp": 0.01042892, "balance_loss_clip": 1.06693888, "balance_loss_mlp": 1.03231215, "epoch": 0.11807851860758732, "flos": 23622218933760.0, "grad_norm": 2.1589731525977682, "language_loss": 0.87997955, "learning_rate": 3.91926502630131e-06, "loss": 0.90250951, "num_input_tokens_seen": 20755940, "step": 982, "time_per_iteration": 3.320653200149536 }, { "auxiliary_loss_clip": 0.01231064, "auxiliary_loss_mlp": 0.01040098, "balance_loss_clip": 1.07202613, "balance_loss_mlp": 1.03068686, "epoch": 0.11819876149822642, "flos": 24972496024320.0, "grad_norm": 2.2612598626659897, "language_loss": 0.72188574, "learning_rate": 3.91904578958632e-06, "loss": 0.74459732, "num_input_tokens_seen": 20775355, "step": 983, "time_per_iteration": 3.2702271938323975 }, { "auxiliary_loss_clip": 0.01243829, "auxiliary_loss_mlp": 0.0104274, "balance_loss_clip": 1.06949043, "balance_loss_mlp": 1.0311408, "epoch": 0.11831900438886551, "flos": 23003226835200.0, "grad_norm": 2.1903680872947096, "language_loss": 0.84238833, "learning_rate": 3.918826261753608e-06, "loss": 0.86525404, "num_input_tokens_seen": 20794935, "step": 984, "time_per_iteration": 2.4896678924560547 }, { "auxiliary_loss_clip": 0.01210961, "auxiliary_loss_mlp": 0.01046308, "balance_loss_clip": 1.06473994, "balance_loss_mlp": 1.03587687, "epoch": 0.1184392472795046, "flos": 27965147604480.0, "grad_norm": 3.364835766975984, "language_loss": 0.71299851, "learning_rate": 3.918606442836478e-06, "loss": 0.73557115, "num_input_tokens_seen": 20817155, "step": 985, "time_per_iteration": 3.4297571182250977 }, { "auxiliary_loss_clip": 0.01227096, "auxiliary_loss_mlp": 0.01047707, "balance_loss_clip": 1.07068181, "balance_loss_mlp": 1.0372349, "epoch": 0.1185594901701437, "flos": 19898497843200.0, "grad_norm": 1.7814807041830716, "language_loss": 0.77578145, "learning_rate": 3.918386332868277e-06, "loss": 0.79852951, "num_input_tokens_seen": 20835125, "step": 986, "time_per_iteration": 2.496600866317749 }, { "auxiliary_loss_clip": 0.01212346, "auxiliary_loss_mlp": 0.01043894, "balance_loss_clip": 1.06438839, "balance_loss_mlp": 1.0332185, "epoch": 0.11867973306078278, "flos": 18912354877440.0, "grad_norm": 1.9041496624056555, "language_loss": 0.94322979, "learning_rate": 3.918165931882394e-06, "loss": 0.96579218, "num_input_tokens_seen": 20853525, "step": 987, "time_per_iteration": 2.4803590774536133 }, { "auxiliary_loss_clip": 0.01153236, "auxiliary_loss_mlp": 0.01046308, "balance_loss_clip": 1.05304921, "balance_loss_mlp": 1.03509068, "epoch": 0.11879997595142187, "flos": 16982803152000.0, "grad_norm": 17.62611678046567, "language_loss": 0.7547071, "learning_rate": 3.917945239912264e-06, "loss": 0.77670258, "num_input_tokens_seen": 20871000, "step": 988, "time_per_iteration": 2.614936113357544 }, { "auxiliary_loss_clip": 0.01178602, "auxiliary_loss_mlp": 0.01039397, "balance_loss_clip": 1.0602349, "balance_loss_mlp": 1.02894831, "epoch": 0.11892021884206096, "flos": 17530369056000.0, "grad_norm": 2.0324308378220324, "language_loss": 0.75453484, "learning_rate": 3.917724256991367e-06, "loss": 0.77671486, "num_input_tokens_seen": 20889745, "step": 989, "time_per_iteration": 2.578970193862915 }, { "auxiliary_loss_clip": 0.01196501, "auxiliary_loss_mlp": 0.01036995, "balance_loss_clip": 1.06219161, "balance_loss_mlp": 1.02698135, "epoch": 0.11904046173270005, "flos": 30955895763840.0, "grad_norm": 3.1331028476364158, "language_loss": 0.81216979, "learning_rate": 3.9175029831532245e-06, "loss": 0.83450472, "num_input_tokens_seen": 20909260, "step": 990, "time_per_iteration": 2.6072514057159424 }, { "auxiliary_loss_clip": 0.01197633, "auxiliary_loss_mlp": 0.01038387, "balance_loss_clip": 1.0647769, "balance_loss_mlp": 1.02765238, "epoch": 0.11916070462333915, "flos": 20157234485760.0, "grad_norm": 2.0083885017395473, "language_loss": 0.88945067, "learning_rate": 3.917281418431404e-06, "loss": 0.91181087, "num_input_tokens_seen": 20928305, "step": 991, "time_per_iteration": 2.577902317047119 }, { "auxiliary_loss_clip": 0.0120928, "auxiliary_loss_mlp": 0.01050016, "balance_loss_clip": 1.06682706, "balance_loss_mlp": 1.03953743, "epoch": 0.11928094751397823, "flos": 23551115961600.0, "grad_norm": 2.4028194144844246, "language_loss": 0.76611543, "learning_rate": 3.917059562859516e-06, "loss": 0.78870845, "num_input_tokens_seen": 20947630, "step": 992, "time_per_iteration": 2.555708169937134 }, { "auxiliary_loss_clip": 0.0120201, "auxiliary_loss_mlp": 0.01045265, "balance_loss_clip": 1.06535006, "balance_loss_mlp": 1.03535318, "epoch": 0.11940119040461733, "flos": 23908426502400.0, "grad_norm": 2.1386527664444506, "language_loss": 0.88685012, "learning_rate": 3.916837416471218e-06, "loss": 0.90932292, "num_input_tokens_seen": 20964250, "step": 993, "time_per_iteration": 2.573564291000366 }, { "auxiliary_loss_clip": 0.01218084, "auxiliary_loss_mlp": 0.01037131, "balance_loss_clip": 1.06228542, "balance_loss_mlp": 1.02732015, "epoch": 0.11952143329525641, "flos": 13844533835520.0, "grad_norm": 2.363845444000928, "language_loss": 0.72483838, "learning_rate": 3.916614979300207e-06, "loss": 0.74739045, "num_input_tokens_seen": 20979095, "step": 994, "time_per_iteration": 2.48179030418396 }, { "auxiliary_loss_clip": 0.01170348, "auxiliary_loss_mlp": 0.01047764, "balance_loss_clip": 1.0617075, "balance_loss_mlp": 1.03817368, "epoch": 0.11964167618589551, "flos": 27015525792000.0, "grad_norm": 2.350948582833249, "language_loss": 0.78669196, "learning_rate": 3.9163922513802274e-06, "loss": 0.80887306, "num_input_tokens_seen": 21001430, "step": 995, "time_per_iteration": 2.669786214828491 }, { "auxiliary_loss_clip": 0.01241704, "auxiliary_loss_mlp": 0.0104019, "balance_loss_clip": 1.0674206, "balance_loss_mlp": 1.02889538, "epoch": 0.1197619190765346, "flos": 12567622273920.0, "grad_norm": 5.665754895251741, "language_loss": 0.82662559, "learning_rate": 3.916169232745067e-06, "loss": 0.84944451, "num_input_tokens_seen": 21019105, "step": 996, "time_per_iteration": 2.449256658554077 }, { "auxiliary_loss_clip": 0.01193978, "auxiliary_loss_mlp": 0.01042703, "balance_loss_clip": 1.05947232, "balance_loss_mlp": 1.03119898, "epoch": 0.11988216196717369, "flos": 16909437623040.0, "grad_norm": 2.7984057589196656, "language_loss": 0.92141402, "learning_rate": 3.915945923428559e-06, "loss": 0.9437809, "num_input_tokens_seen": 21035630, "step": 997, "time_per_iteration": 2.51652455329895 }, { "auxiliary_loss_clip": 0.01219222, "auxiliary_loss_mlp": 0.01036305, "balance_loss_clip": 1.06218028, "balance_loss_mlp": 1.02565324, "epoch": 0.12000240485781279, "flos": 16216577205120.0, "grad_norm": 2.196823570379757, "language_loss": 0.83077949, "learning_rate": 3.915722323464577e-06, "loss": 0.85333478, "num_input_tokens_seen": 21054235, "step": 998, "time_per_iteration": 2.489825487136841 }, { "auxiliary_loss_clip": 0.01224061, "auxiliary_loss_mlp": 0.01039539, "balance_loss_clip": 1.06665778, "balance_loss_mlp": 1.02829814, "epoch": 0.12012264774845187, "flos": 49344887525760.0, "grad_norm": 2.287014030481074, "language_loss": 0.70666665, "learning_rate": 3.91549843288704e-06, "loss": 0.72930264, "num_input_tokens_seen": 21077915, "step": 999, "time_per_iteration": 2.747929573059082 }, { "auxiliary_loss_clip": 0.01194854, "auxiliary_loss_mlp": 0.00764529, "balance_loss_clip": 1.05817819, "balance_loss_mlp": 1.00008881, "epoch": 0.12024289063909097, "flos": 26979435601920.0, "grad_norm": 2.205937005047137, "language_loss": 0.78925496, "learning_rate": 3.915274251729916e-06, "loss": 0.8088488, "num_input_tokens_seen": 21099205, "step": 1000, "time_per_iteration": 2.6223549842834473 }, { "auxiliary_loss_clip": 0.0120193, "auxiliary_loss_mlp": 0.01039759, "balance_loss_clip": 1.06504798, "balance_loss_mlp": 1.02836823, "epoch": 0.12036313352973005, "flos": 19537308633600.0, "grad_norm": 1.8510918185789669, "language_loss": 0.90102053, "learning_rate": 3.91504978002721e-06, "loss": 0.92343736, "num_input_tokens_seen": 21118260, "step": 1001, "time_per_iteration": 2.5627899169921875 }, { "auxiliary_loss_clip": 0.0120985, "auxiliary_loss_mlp": 0.00764003, "balance_loss_clip": 1.06157613, "balance_loss_mlp": 1.00008309, "epoch": 0.12048337642036915, "flos": 17268256535040.0, "grad_norm": 1.8351333856395746, "language_loss": 0.76219308, "learning_rate": 3.914825017812974e-06, "loss": 0.78193158, "num_input_tokens_seen": 21134910, "step": 1002, "time_per_iteration": 2.506192445755005 }, { "auxiliary_loss_clip": 0.01210395, "auxiliary_loss_mlp": 0.01034836, "balance_loss_clip": 1.06747937, "balance_loss_mlp": 1.0246377, "epoch": 0.12060361931100824, "flos": 22856962654080.0, "grad_norm": 2.3618947865169138, "language_loss": 0.72538304, "learning_rate": 3.9145999651213065e-06, "loss": 0.7478354, "num_input_tokens_seen": 21154150, "step": 1003, "time_per_iteration": 2.5486679077148438 }, { "auxiliary_loss_clip": 0.01228804, "auxiliary_loss_mlp": 0.01045953, "balance_loss_clip": 1.07009506, "balance_loss_mlp": 1.03474689, "epoch": 0.12072386220164733, "flos": 16726795943040.0, "grad_norm": 2.941037161534408, "language_loss": 0.88428086, "learning_rate": 3.9143746219863465e-06, "loss": 0.90702844, "num_input_tokens_seen": 21171255, "step": 1004, "time_per_iteration": 2.478365421295166 }, { "auxiliary_loss_clip": 0.01127148, "auxiliary_loss_mlp": 0.01010518, "balance_loss_clip": 1.04374146, "balance_loss_mlp": 1.00746667, "epoch": 0.12084410509228642, "flos": 55144176105600.0, "grad_norm": 0.946783332347811, "language_loss": 0.64760125, "learning_rate": 3.914148988442278e-06, "loss": 0.66897792, "num_input_tokens_seen": 21227045, "step": 1005, "time_per_iteration": 3.1101934909820557 }, { "auxiliary_loss_clip": 0.01193567, "auxiliary_loss_mlp": 0.01043695, "balance_loss_clip": 1.06080675, "balance_loss_mlp": 1.03229272, "epoch": 0.1209643479829255, "flos": 26760236855040.0, "grad_norm": 2.557274589996241, "language_loss": 0.94781184, "learning_rate": 3.91392306452333e-06, "loss": 0.9701845, "num_input_tokens_seen": 21244120, "step": 1006, "time_per_iteration": 3.2965023517608643 }, { "auxiliary_loss_clip": 0.01241209, "auxiliary_loss_mlp": 0.01041381, "balance_loss_clip": 1.06694961, "balance_loss_mlp": 1.0306108, "epoch": 0.1210845908735646, "flos": 11035026725760.0, "grad_norm": 3.089521166468796, "language_loss": 0.66314483, "learning_rate": 3.913696850263774e-06, "loss": 0.68597078, "num_input_tokens_seen": 21258485, "step": 1007, "time_per_iteration": 2.4378108978271484 }, { "auxiliary_loss_clip": 0.01226125, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.06631136, "balance_loss_mlp": 1.02385378, "epoch": 0.1212048337642037, "flos": 20484631975680.0, "grad_norm": 1.948507140410761, "language_loss": 0.79312527, "learning_rate": 3.913470345697929e-06, "loss": 0.81573302, "num_input_tokens_seen": 21277115, "step": 1008, "time_per_iteration": 2.5120370388031006 }, { "auxiliary_loss_clip": 0.0117809, "auxiliary_loss_mlp": 0.01042639, "balance_loss_clip": 1.06074333, "balance_loss_mlp": 1.0321312, "epoch": 0.12132507665484278, "flos": 22346061557760.0, "grad_norm": 2.147806731130658, "language_loss": 0.85453367, "learning_rate": 3.913243550860153e-06, "loss": 0.87674093, "num_input_tokens_seen": 21294880, "step": 1009, "time_per_iteration": 4.116698980331421 }, { "auxiliary_loss_clip": 0.01228214, "auxiliary_loss_mlp": 0.01040269, "balance_loss_clip": 1.07086015, "balance_loss_mlp": 1.02935588, "epoch": 0.12144531954548188, "flos": 29314957818240.0, "grad_norm": 2.842664618129696, "language_loss": 0.75874376, "learning_rate": 3.913016465784852e-06, "loss": 0.78142858, "num_input_tokens_seen": 21315555, "step": 1010, "time_per_iteration": 2.5714364051818848 }, { "auxiliary_loss_clip": 0.01174774, "auxiliary_loss_mlp": 0.01042177, "balance_loss_clip": 1.05781555, "balance_loss_mlp": 1.03135252, "epoch": 0.12156556243612096, "flos": 20485242506880.0, "grad_norm": 2.91900781367874, "language_loss": 0.72013491, "learning_rate": 3.912789090506474e-06, "loss": 0.74230444, "num_input_tokens_seen": 21334815, "step": 1011, "time_per_iteration": 3.4006738662719727 }, { "auxiliary_loss_clip": 0.01200214, "auxiliary_loss_mlp": 0.01036179, "balance_loss_clip": 1.0606755, "balance_loss_mlp": 1.02527761, "epoch": 0.12168580532676006, "flos": 16472009796480.0, "grad_norm": 4.9035041887337485, "language_loss": 0.71690369, "learning_rate": 3.9125614250595114e-06, "loss": 0.73926765, "num_input_tokens_seen": 21351025, "step": 1012, "time_per_iteration": 2.533656358718872 }, { "auxiliary_loss_clip": 0.01227168, "auxiliary_loss_mlp": 0.01049969, "balance_loss_clip": 1.06686234, "balance_loss_mlp": 1.03878081, "epoch": 0.12180604821739914, "flos": 15341290588800.0, "grad_norm": 2.8939235399973793, "language_loss": 0.89178646, "learning_rate": 3.912333469478502e-06, "loss": 0.91455787, "num_input_tokens_seen": 21368990, "step": 1013, "time_per_iteration": 2.47973895072937 }, { "auxiliary_loss_clip": 0.01212338, "auxiliary_loss_mlp": 0.01042517, "balance_loss_clip": 1.0658381, "balance_loss_mlp": 1.03148365, "epoch": 0.12192629110803824, "flos": 19318038059520.0, "grad_norm": 3.3512641495084448, "language_loss": 0.78254145, "learning_rate": 3.912105223798025e-06, "loss": 0.80509001, "num_input_tokens_seen": 21388410, "step": 1014, "time_per_iteration": 2.5451502799987793 }, { "auxiliary_loss_clip": 0.0111427, "auxiliary_loss_mlp": 0.01007977, "balance_loss_clip": 1.04266477, "balance_loss_mlp": 1.00494945, "epoch": 0.12204653399867733, "flos": 47725354085760.0, "grad_norm": 0.999236496528554, "language_loss": 0.6771003, "learning_rate": 3.9118766880527065e-06, "loss": 0.69832277, "num_input_tokens_seen": 21442845, "step": 1015, "time_per_iteration": 3.0877015590667725 }, { "auxiliary_loss_clip": 0.01169396, "auxiliary_loss_mlp": 0.01040262, "balance_loss_clip": 1.05801344, "balance_loss_mlp": 1.03053463, "epoch": 0.12216677688931642, "flos": 18221936584320.0, "grad_norm": 1.8313106676261872, "language_loss": 0.73843783, "learning_rate": 3.9116478622772145e-06, "loss": 0.76053441, "num_input_tokens_seen": 21461420, "step": 1016, "time_per_iteration": 2.5957770347595215 }, { "auxiliary_loss_clip": 0.01221427, "auxiliary_loss_mlp": 0.01044564, "balance_loss_clip": 1.06845319, "balance_loss_mlp": 1.03365088, "epoch": 0.12228701977995551, "flos": 27525636789120.0, "grad_norm": 1.9327462647473441, "language_loss": 0.88105464, "learning_rate": 3.911418746506261e-06, "loss": 0.90371454, "num_input_tokens_seen": 21481550, "step": 1017, "time_per_iteration": 2.591132879257202 }, { "auxiliary_loss_clip": 0.01227371, "auxiliary_loss_mlp": 0.01043338, "balance_loss_clip": 1.07011354, "balance_loss_mlp": 1.03272271, "epoch": 0.1224072626705946, "flos": 21798136517760.0, "grad_norm": 3.5264767260250314, "language_loss": 0.78603655, "learning_rate": 3.911189340774604e-06, "loss": 0.8087436, "num_input_tokens_seen": 21501680, "step": 1018, "time_per_iteration": 2.509046792984009 }, { "auxiliary_loss_clip": 0.01214909, "auxiliary_loss_mlp": 0.01041383, "balance_loss_clip": 1.06339788, "balance_loss_mlp": 1.03122067, "epoch": 0.1225275055612337, "flos": 20703758895360.0, "grad_norm": 2.291850811442564, "language_loss": 0.79665935, "learning_rate": 3.910959645117043e-06, "loss": 0.81922233, "num_input_tokens_seen": 21521015, "step": 1019, "time_per_iteration": 2.5594072341918945 }, { "auxiliary_loss_clip": 0.01115963, "auxiliary_loss_mlp": 0.00754196, "balance_loss_clip": 1.03989565, "balance_loss_mlp": 0.99988312, "epoch": 0.12264774845187278, "flos": 57745294462080.0, "grad_norm": 0.8158658496248952, "language_loss": 0.56706023, "learning_rate": 3.910729659568423e-06, "loss": 0.58576179, "num_input_tokens_seen": 21578200, "step": 1020, "time_per_iteration": 3.125624179840088 }, { "auxiliary_loss_clip": 0.01209109, "auxiliary_loss_mlp": 0.01050486, "balance_loss_clip": 1.06729126, "balance_loss_mlp": 1.04064512, "epoch": 0.12276799134251187, "flos": 26396282298240.0, "grad_norm": 2.3552641417573708, "language_loss": 0.82110727, "learning_rate": 3.9104993841636344e-06, "loss": 0.84370321, "num_input_tokens_seen": 21598770, "step": 1021, "time_per_iteration": 2.6077752113342285 }, { "auxiliary_loss_clip": 0.01206334, "auxiliary_loss_mlp": 0.00763009, "balance_loss_clip": 1.07008576, "balance_loss_mlp": 1.00006676, "epoch": 0.12288823423315097, "flos": 21064193919360.0, "grad_norm": 1.7764777241659957, "language_loss": 0.81089461, "learning_rate": 3.910268818937608e-06, "loss": 0.8305881, "num_input_tokens_seen": 21616925, "step": 1022, "time_per_iteration": 2.589017629623413 }, { "auxiliary_loss_clip": 0.01182627, "auxiliary_loss_mlp": 0.0104498, "balance_loss_clip": 1.06673908, "balance_loss_mlp": 1.03384018, "epoch": 0.12300847712379005, "flos": 12312441077760.0, "grad_norm": 2.272365399970682, "language_loss": 0.87539548, "learning_rate": 3.9100379639253196e-06, "loss": 0.89767152, "num_input_tokens_seen": 21633645, "step": 1023, "time_per_iteration": 2.594097375869751 }, { "auxiliary_loss_clip": 0.01208385, "auxiliary_loss_mlp": 0.01040962, "balance_loss_clip": 1.0630374, "balance_loss_mlp": 1.03034019, "epoch": 0.12312872001442915, "flos": 16762239688320.0, "grad_norm": 2.63593625940969, "language_loss": 0.86253965, "learning_rate": 3.909806819161791e-06, "loss": 0.88503313, "num_input_tokens_seen": 21649120, "step": 1024, "time_per_iteration": 2.507352590560913 }, { "auxiliary_loss_clip": 0.01197829, "auxiliary_loss_mlp": 0.01039948, "balance_loss_clip": 1.06184983, "balance_loss_mlp": 1.02923715, "epoch": 0.12324896290506823, "flos": 18404937400320.0, "grad_norm": 2.0377459484418408, "language_loss": 0.85922503, "learning_rate": 3.909575384682086e-06, "loss": 0.88160288, "num_input_tokens_seen": 21668000, "step": 1025, "time_per_iteration": 2.5611326694488525 }, { "auxiliary_loss_clip": 0.01227751, "auxiliary_loss_mlp": 0.01032966, "balance_loss_clip": 1.06705713, "balance_loss_mlp": 1.02218926, "epoch": 0.12336920579570733, "flos": 18915407533440.0, "grad_norm": 1.8388627303701173, "language_loss": 0.69163442, "learning_rate": 3.9093436605213144e-06, "loss": 0.71424156, "num_input_tokens_seen": 21688500, "step": 1026, "time_per_iteration": 2.537940740585327 }, { "auxiliary_loss_clip": 0.01203698, "auxiliary_loss_mlp": 0.01041527, "balance_loss_clip": 1.06201303, "balance_loss_mlp": 1.03127527, "epoch": 0.12348944868634643, "flos": 23878369797120.0, "grad_norm": 2.0210308161222366, "language_loss": 0.7967819, "learning_rate": 3.909111646714627e-06, "loss": 0.81923413, "num_input_tokens_seen": 21709345, "step": 1027, "time_per_iteration": 2.579461097717285 }, { "auxiliary_loss_clip": 0.01238131, "auxiliary_loss_mlp": 0.01042365, "balance_loss_clip": 1.06863713, "balance_loss_mlp": 1.03266764, "epoch": 0.12360969157698551, "flos": 19026084314880.0, "grad_norm": 2.1369194900366577, "language_loss": 0.72323263, "learning_rate": 3.9088793432972206e-06, "loss": 0.74603754, "num_input_tokens_seen": 21728165, "step": 1028, "time_per_iteration": 2.4654159545898438 }, { "auxiliary_loss_clip": 0.01182774, "auxiliary_loss_mlp": 0.01046604, "balance_loss_clip": 1.06353712, "balance_loss_mlp": 1.03705001, "epoch": 0.1237299344676246, "flos": 13224607983360.0, "grad_norm": 2.131551928801967, "language_loss": 0.82159871, "learning_rate": 3.908646750304336e-06, "loss": 0.84389251, "num_input_tokens_seen": 21745850, "step": 1029, "time_per_iteration": 2.57853627204895 }, { "auxiliary_loss_clip": 0.01211947, "auxiliary_loss_mlp": 0.0103993, "balance_loss_clip": 1.06763446, "balance_loss_mlp": 1.02903426, "epoch": 0.12385017735826369, "flos": 20485673470080.0, "grad_norm": 1.5975521024949157, "language_loss": 0.87238926, "learning_rate": 3.908413867771257e-06, "loss": 0.89490801, "num_input_tokens_seen": 21764760, "step": 1030, "time_per_iteration": 2.540369987487793 }, { "auxiliary_loss_clip": 0.01226985, "auxiliary_loss_mlp": 0.01050138, "balance_loss_clip": 1.07096887, "balance_loss_mlp": 1.03969574, "epoch": 0.12397042024890279, "flos": 17347835116800.0, "grad_norm": 2.2459092524997883, "language_loss": 0.80491227, "learning_rate": 3.908180695733311e-06, "loss": 0.82768351, "num_input_tokens_seen": 21784250, "step": 1031, "time_per_iteration": 2.5008444786071777 }, { "auxiliary_loss_clip": 0.01151949, "auxiliary_loss_mlp": 0.01038405, "balance_loss_clip": 1.05252504, "balance_loss_mlp": 1.02744436, "epoch": 0.12409066313954187, "flos": 20412343854720.0, "grad_norm": 1.9526292427774656, "language_loss": 0.82673395, "learning_rate": 3.907947234225871e-06, "loss": 0.84863746, "num_input_tokens_seen": 21803260, "step": 1032, "time_per_iteration": 3.433749198913574 }, { "auxiliary_loss_clip": 0.01157618, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.05775177, "balance_loss_mlp": 1.02688885, "epoch": 0.12421090603018096, "flos": 20736688688640.0, "grad_norm": 1.7882049050897433, "language_loss": 0.87298417, "learning_rate": 3.907713483284352e-06, "loss": 0.89493525, "num_input_tokens_seen": 21822735, "step": 1033, "time_per_iteration": 2.632786512374878 }, { "auxiliary_loss_clip": 0.01131701, "auxiliary_loss_mlp": 0.01048709, "balance_loss_clip": 1.05019569, "balance_loss_mlp": 1.03846884, "epoch": 0.12433114892082006, "flos": 24498834353280.0, "grad_norm": 2.1218305339539834, "language_loss": 0.97514492, "learning_rate": 3.907479442944216e-06, "loss": 0.99694896, "num_input_tokens_seen": 21841140, "step": 1034, "time_per_iteration": 3.471487283706665 }, { "auxiliary_loss_clip": 0.01223738, "auxiliary_loss_mlp": 0.01045104, "balance_loss_clip": 1.0684098, "balance_loss_mlp": 1.03527486, "epoch": 0.12445139181145914, "flos": 19682315838720.0, "grad_norm": 2.3013008268563735, "language_loss": 0.92720854, "learning_rate": 3.907245113240963e-06, "loss": 0.94989699, "num_input_tokens_seen": 21859260, "step": 1035, "time_per_iteration": 3.2621138095855713 }, { "auxiliary_loss_clip": 0.01192164, "auxiliary_loss_mlp": 0.0104114, "balance_loss_clip": 1.06099451, "balance_loss_mlp": 1.03011334, "epoch": 0.12457163470209824, "flos": 46423087522560.0, "grad_norm": 1.7731773297659463, "language_loss": 0.7389465, "learning_rate": 3.907010494210144e-06, "loss": 0.76127958, "num_input_tokens_seen": 21881920, "step": 1036, "time_per_iteration": 2.8302087783813477 }, { "auxiliary_loss_clip": 0.01226856, "auxiliary_loss_mlp": 0.01040102, "balance_loss_clip": 1.06795359, "balance_loss_mlp": 1.02981997, "epoch": 0.12469187759273732, "flos": 20376289578240.0, "grad_norm": 2.3599893918052364, "language_loss": 0.92430198, "learning_rate": 3.9067755858873495e-06, "loss": 0.94697154, "num_input_tokens_seen": 21898720, "step": 1037, "time_per_iteration": 2.5211563110351562 }, { "auxiliary_loss_clip": 0.01097662, "auxiliary_loss_mlp": 0.01006655, "balance_loss_clip": 1.03346586, "balance_loss_mlp": 1.00376976, "epoch": 0.12481212048337642, "flos": 69224641447680.0, "grad_norm": 0.8616946533442321, "language_loss": 0.6283766, "learning_rate": 3.906540388308214e-06, "loss": 0.64941978, "num_input_tokens_seen": 21958305, "step": 1038, "time_per_iteration": 3.9670016765594482 }, { "auxiliary_loss_clip": 0.01162288, "auxiliary_loss_mlp": 0.01033035, "balance_loss_clip": 1.0609616, "balance_loss_mlp": 1.0230329, "epoch": 0.12493236337401552, "flos": 18223696350720.0, "grad_norm": 1.6842429156144205, "language_loss": 0.81365609, "learning_rate": 3.906304901508417e-06, "loss": 0.83560938, "num_input_tokens_seen": 21977205, "step": 1039, "time_per_iteration": 2.600846290588379 }, { "auxiliary_loss_clip": 0.01227438, "auxiliary_loss_mlp": 0.010384, "balance_loss_clip": 1.07246244, "balance_loss_mlp": 1.02747488, "epoch": 0.12505260626465461, "flos": 30044375303040.0, "grad_norm": 2.4189520001496287, "language_loss": 0.76166576, "learning_rate": 3.9060691255236835e-06, "loss": 0.78432417, "num_input_tokens_seen": 21997770, "step": 1040, "time_per_iteration": 2.5914833545684814 }, { "auxiliary_loss_clip": 0.0121671, "auxiliary_loss_mlp": 0.01061994, "balance_loss_clip": 1.06214261, "balance_loss_mlp": 1.05161726, "epoch": 0.1251728491552937, "flos": 24433980347520.0, "grad_norm": 1.7032147333213183, "language_loss": 0.80534184, "learning_rate": 3.905833060389778e-06, "loss": 0.82812893, "num_input_tokens_seen": 22021890, "step": 1041, "time_per_iteration": 2.648988723754883 }, { "auxiliary_loss_clip": 0.01238007, "auxiliary_loss_mlp": 0.00763197, "balance_loss_clip": 1.06839764, "balance_loss_mlp": 1.0000236, "epoch": 0.12529309204593278, "flos": 27119809952640.0, "grad_norm": 14.611580676521498, "language_loss": 0.78505158, "learning_rate": 3.905596706142513e-06, "loss": 0.80506361, "num_input_tokens_seen": 22043300, "step": 1042, "time_per_iteration": 2.55263614654541 }, { "auxiliary_loss_clip": 0.01187099, "auxiliary_loss_mlp": 0.01039232, "balance_loss_clip": 1.06089473, "balance_loss_mlp": 1.02934933, "epoch": 0.12541333493657186, "flos": 30774151923840.0, "grad_norm": 2.127013420047938, "language_loss": 0.86045521, "learning_rate": 3.9053600628177435e-06, "loss": 0.8827185, "num_input_tokens_seen": 22062910, "step": 1043, "time_per_iteration": 2.642418146133423 }, { "auxiliary_loss_clip": 0.01235644, "auxiliary_loss_mlp": 0.01035037, "balance_loss_clip": 1.06703401, "balance_loss_mlp": 1.02561402, "epoch": 0.12553357782721097, "flos": 23659566099840.0, "grad_norm": 2.2044162268617598, "language_loss": 0.84380829, "learning_rate": 3.905123130451367e-06, "loss": 0.86651516, "num_input_tokens_seen": 22084010, "step": 1044, "time_per_iteration": 2.5027005672454834 }, { "auxiliary_loss_clip": 0.01239681, "auxiliary_loss_mlp": 0.01040725, "balance_loss_clip": 1.07033145, "balance_loss_mlp": 1.03039598, "epoch": 0.12565382071785006, "flos": 24863758577280.0, "grad_norm": 1.8941177059674226, "language_loss": 0.79382074, "learning_rate": 3.904885909079326e-06, "loss": 0.81662488, "num_input_tokens_seen": 22102795, "step": 1045, "time_per_iteration": 2.5005042552948 }, { "auxiliary_loss_clip": 0.01223366, "auxiliary_loss_mlp": 0.01041051, "balance_loss_clip": 1.06666172, "balance_loss_mlp": 1.03070986, "epoch": 0.12577406360848914, "flos": 21360780518400.0, "grad_norm": 2.507624517364454, "language_loss": 0.78011411, "learning_rate": 3.904648398737607e-06, "loss": 0.80275834, "num_input_tokens_seen": 22121360, "step": 1046, "time_per_iteration": 2.5218708515167236 }, { "auxiliary_loss_clip": 0.01234951, "auxiliary_loss_mlp": 0.01034782, "balance_loss_clip": 1.06715405, "balance_loss_mlp": 1.02464294, "epoch": 0.12589430649912825, "flos": 36138056774400.0, "grad_norm": 1.9353736721021235, "language_loss": 0.7848376, "learning_rate": 3.9044105994622406e-06, "loss": 0.80753493, "num_input_tokens_seen": 22142505, "step": 1047, "time_per_iteration": 2.5868380069732666 }, { "auxiliary_loss_clip": 0.01209905, "auxiliary_loss_mlp": 0.00763811, "balance_loss_clip": 1.06370974, "balance_loss_mlp": 1.00001907, "epoch": 0.12601454938976733, "flos": 25337671643520.0, "grad_norm": 1.8846752401754707, "language_loss": 0.81593323, "learning_rate": 3.9041725112893005e-06, "loss": 0.83567035, "num_input_tokens_seen": 22163730, "step": 1048, "time_per_iteration": 2.592961311340332 }, { "auxiliary_loss_clip": 0.01184319, "auxiliary_loss_mlp": 0.01036099, "balance_loss_clip": 1.06369758, "balance_loss_mlp": 1.02505994, "epoch": 0.12613479228040642, "flos": 15560094286080.0, "grad_norm": 3.8366249066535074, "language_loss": 0.75072438, "learning_rate": 3.903934134254904e-06, "loss": 0.7729286, "num_input_tokens_seen": 22181520, "step": 1049, "time_per_iteration": 2.52116060256958 }, { "auxiliary_loss_clip": 0.01227174, "auxiliary_loss_mlp": 0.01036044, "balance_loss_clip": 1.06580555, "balance_loss_mlp": 1.02560163, "epoch": 0.1262550351710455, "flos": 21470595373440.0, "grad_norm": 2.1002996944746184, "language_loss": 0.84769142, "learning_rate": 3.903695468395213e-06, "loss": 0.87032366, "num_input_tokens_seen": 22199390, "step": 1050, "time_per_iteration": 2.5044331550598145 }, { "auxiliary_loss_clip": 0.01210677, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.06258106, "balance_loss_mlp": 1.0303582, "epoch": 0.1263752780616846, "flos": 31576719456000.0, "grad_norm": 2.113461247603634, "language_loss": 0.55644429, "learning_rate": 3.903456513746434e-06, "loss": 0.57895082, "num_input_tokens_seen": 22220365, "step": 1051, "time_per_iteration": 2.604372024536133 }, { "auxiliary_loss_clip": 0.01234611, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.06648862, "balance_loss_mlp": 1.02904713, "epoch": 0.1264955209523237, "flos": 28768217927040.0, "grad_norm": 1.8333412222344823, "language_loss": 0.87386739, "learning_rate": 3.903217270344815e-06, "loss": 0.89660329, "num_input_tokens_seen": 22240615, "step": 1052, "time_per_iteration": 2.533623218536377 }, { "auxiliary_loss_clip": 0.01188388, "auxiliary_loss_mlp": 0.0103917, "balance_loss_clip": 1.06217551, "balance_loss_mlp": 1.02996135, "epoch": 0.12661576384296278, "flos": 29241125412480.0, "grad_norm": 1.7217997555068911, "language_loss": 0.82403624, "learning_rate": 3.902977738226648e-06, "loss": 0.84631181, "num_input_tokens_seen": 22261350, "step": 1053, "time_per_iteration": 2.62927508354187 }, { "auxiliary_loss_clip": 0.01226531, "auxiliary_loss_mlp": 0.01037201, "balance_loss_clip": 1.06914687, "balance_loss_mlp": 1.02600074, "epoch": 0.12673600673360189, "flos": 20850346298880.0, "grad_norm": 1.8901247559226004, "language_loss": 0.90926504, "learning_rate": 3.902737917428273e-06, "loss": 0.93190235, "num_input_tokens_seen": 22279515, "step": 1054, "time_per_iteration": 2.5070865154266357 }, { "auxiliary_loss_clip": 0.01235273, "auxiliary_loss_mlp": 0.01040541, "balance_loss_clip": 1.06722105, "balance_loss_mlp": 1.03018808, "epoch": 0.12685624962424097, "flos": 25263695583360.0, "grad_norm": 1.6935110264801863, "language_loss": 0.83799517, "learning_rate": 3.902497807986068e-06, "loss": 0.86075336, "num_input_tokens_seen": 22299535, "step": 1055, "time_per_iteration": 2.501563549041748 }, { "auxiliary_loss_clip": 0.01194053, "auxiliary_loss_mlp": 0.01039177, "balance_loss_clip": 1.05978036, "balance_loss_mlp": 1.02860951, "epoch": 0.12697649251488005, "flos": 27527109246720.0, "grad_norm": 1.741663241716953, "language_loss": 0.8377679, "learning_rate": 3.902257409936458e-06, "loss": 0.86010021, "num_input_tokens_seen": 22320300, "step": 1056, "time_per_iteration": 2.649744987487793 }, { "auxiliary_loss_clip": 0.01210589, "auxiliary_loss_mlp": 0.01042695, "balance_loss_clip": 1.06760371, "balance_loss_mlp": 1.03259778, "epoch": 0.12709673540551916, "flos": 21251863503360.0, "grad_norm": 1.9954526018250913, "language_loss": 0.83935523, "learning_rate": 3.902016723315912e-06, "loss": 0.86188811, "num_input_tokens_seen": 22338240, "step": 1057, "time_per_iteration": 2.5422284603118896 }, { "auxiliary_loss_clip": 0.01217533, "auxiliary_loss_mlp": 0.01046021, "balance_loss_clip": 1.06248784, "balance_loss_mlp": 1.03604937, "epoch": 0.12721697829615825, "flos": 25337707557120.0, "grad_norm": 4.1453064297143944, "language_loss": 0.69615984, "learning_rate": 3.901775748160941e-06, "loss": 0.7187953, "num_input_tokens_seen": 22357420, "step": 1058, "time_per_iteration": 3.2716434001922607 }, { "auxiliary_loss_clip": 0.01111976, "auxiliary_loss_mlp": 0.01009213, "balance_loss_clip": 1.04346955, "balance_loss_mlp": 1.00648272, "epoch": 0.12733722118679733, "flos": 61943287754880.0, "grad_norm": 0.8154381214633049, "language_loss": 0.60871136, "learning_rate": 3.901534484508101e-06, "loss": 0.62992334, "num_input_tokens_seen": 22420095, "step": 1059, "time_per_iteration": 3.1701419353485107 }, { "auxiliary_loss_clip": 0.01196456, "auxiliary_loss_mlp": 0.01042208, "balance_loss_clip": 1.0625875, "balance_loss_mlp": 1.03169942, "epoch": 0.1274574640774364, "flos": 26976742081920.0, "grad_norm": 1.9840385864703185, "language_loss": 0.74875391, "learning_rate": 3.901292932393991e-06, "loss": 0.77114058, "num_input_tokens_seen": 22438975, "step": 1060, "time_per_iteration": 3.372079372406006 }, { "auxiliary_loss_clip": 0.01242457, "auxiliary_loss_mlp": 0.01041294, "balance_loss_clip": 1.0721159, "balance_loss_mlp": 1.03060734, "epoch": 0.12757770696807552, "flos": 22236318529920.0, "grad_norm": 3.0989016028262593, "language_loss": 0.85295761, "learning_rate": 3.9010510918552555e-06, "loss": 0.87579513, "num_input_tokens_seen": 22458050, "step": 1061, "time_per_iteration": 3.159975290298462 }, { "auxiliary_loss_clip": 0.01205721, "auxiliary_loss_mlp": 0.01055676, "balance_loss_clip": 1.06359768, "balance_loss_mlp": 1.04441035, "epoch": 0.1276979498587146, "flos": 28547905858560.0, "grad_norm": 2.251265036280461, "language_loss": 0.74592358, "learning_rate": 3.900808962928581e-06, "loss": 0.76853752, "num_input_tokens_seen": 22475665, "step": 1062, "time_per_iteration": 2.6017189025878906 }, { "auxiliary_loss_clip": 0.01239666, "auxiliary_loss_mlp": 0.01040674, "balance_loss_clip": 1.07162142, "balance_loss_mlp": 1.03125095, "epoch": 0.1278181927493537, "flos": 17420338719360.0, "grad_norm": 2.4556067199211773, "language_loss": 0.89300942, "learning_rate": 3.900566545650698e-06, "loss": 0.91581285, "num_input_tokens_seen": 22493335, "step": 1063, "time_per_iteration": 2.456106185913086 }, { "auxiliary_loss_clip": 0.0122523, "auxiliary_loss_mlp": 0.01046825, "balance_loss_clip": 1.06893086, "balance_loss_mlp": 1.0362215, "epoch": 0.1279384356399928, "flos": 21138636856320.0, "grad_norm": 2.2882648592835912, "language_loss": 0.8186565, "learning_rate": 3.900323840058381e-06, "loss": 0.84137702, "num_input_tokens_seen": 22511045, "step": 1064, "time_per_iteration": 3.282580852508545 }, { "auxiliary_loss_clip": 0.01220223, "auxiliary_loss_mlp": 0.01034484, "balance_loss_clip": 1.06271076, "balance_loss_mlp": 1.02486968, "epoch": 0.12805867853063188, "flos": 26576733248640.0, "grad_norm": 1.8128715621103602, "language_loss": 0.81777501, "learning_rate": 3.900080846188449e-06, "loss": 0.84032208, "num_input_tokens_seen": 22529635, "step": 1065, "time_per_iteration": 2.559678554534912 }, { "auxiliary_loss_clip": 0.01239158, "auxiliary_loss_mlp": 0.01041953, "balance_loss_clip": 1.06922078, "balance_loss_mlp": 1.03142071, "epoch": 0.12817892142127096, "flos": 16436206915200.0, "grad_norm": 1.8277312969692305, "language_loss": 0.8130101, "learning_rate": 3.8998375640777625e-06, "loss": 0.83582121, "num_input_tokens_seen": 22547505, "step": 1066, "time_per_iteration": 2.471086025238037 }, { "auxiliary_loss_clip": 0.01100713, "auxiliary_loss_mlp": 0.01003675, "balance_loss_clip": 1.03002143, "balance_loss_mlp": 1.00096846, "epoch": 0.12829916431191005, "flos": 60757049099520.0, "grad_norm": 0.7055097184507033, "language_loss": 0.52666527, "learning_rate": 3.899593993763229e-06, "loss": 0.54770911, "num_input_tokens_seen": 22608465, "step": 1067, "time_per_iteration": 3.065671443939209 }, { "auxiliary_loss_clip": 0.01185337, "auxiliary_loss_mlp": 0.01043484, "balance_loss_clip": 1.0615418, "balance_loss_mlp": 1.03236735, "epoch": 0.12841940720254916, "flos": 29786895636480.0, "grad_norm": 2.905354981767814, "language_loss": 0.81025183, "learning_rate": 3.899350135281796e-06, "loss": 0.83254004, "num_input_tokens_seen": 22629465, "step": 1068, "time_per_iteration": 2.6390085220336914 }, { "auxiliary_loss_clip": 0.01198111, "auxiliary_loss_mlp": 0.010461, "balance_loss_clip": 1.06490064, "balance_loss_mlp": 1.03622901, "epoch": 0.12853965009318824, "flos": 25951851319680.0, "grad_norm": 2.030057409363176, "language_loss": 0.79829299, "learning_rate": 3.8991059886704585e-06, "loss": 0.8207351, "num_input_tokens_seen": 22648970, "step": 1069, "time_per_iteration": 2.5977070331573486 }, { "auxiliary_loss_clip": 0.01187355, "auxiliary_loss_mlp": 0.01042341, "balance_loss_clip": 1.06336772, "balance_loss_mlp": 1.03122497, "epoch": 0.12865989298382732, "flos": 30846871008000.0, "grad_norm": 1.9402292555019003, "language_loss": 0.83154994, "learning_rate": 3.898861553966252e-06, "loss": 0.85384691, "num_input_tokens_seen": 22668620, "step": 1070, "time_per_iteration": 2.651322603225708 }, { "auxiliary_loss_clip": 0.01145557, "auxiliary_loss_mlp": 0.01048347, "balance_loss_clip": 1.05353045, "balance_loss_mlp": 1.03677213, "epoch": 0.12878013587446643, "flos": 25885776251520.0, "grad_norm": 1.8593134196282446, "language_loss": 0.88117647, "learning_rate": 3.898616831206257e-06, "loss": 0.90311563, "num_input_tokens_seen": 22689045, "step": 1071, "time_per_iteration": 2.7072017192840576 }, { "auxiliary_loss_clip": 0.01187902, "auxiliary_loss_mlp": 0.0104513, "balance_loss_clip": 1.05970454, "balance_loss_mlp": 1.03456807, "epoch": 0.12890037876510552, "flos": 23333138277120.0, "grad_norm": 2.5660865153847987, "language_loss": 0.76886845, "learning_rate": 3.8983718204276e-06, "loss": 0.79119885, "num_input_tokens_seen": 22711265, "step": 1072, "time_per_iteration": 2.6430046558380127 }, { "auxiliary_loss_clip": 0.01207625, "auxiliary_loss_mlp": 0.01049004, "balance_loss_clip": 1.06569839, "balance_loss_mlp": 1.03885889, "epoch": 0.1290206216557446, "flos": 23587242065280.0, "grad_norm": 1.7357677556958448, "language_loss": 0.82732749, "learning_rate": 3.898126521667446e-06, "loss": 0.84989381, "num_input_tokens_seen": 22731420, "step": 1073, "time_per_iteration": 2.5688769817352295 }, { "auxiliary_loss_clip": 0.01225172, "auxiliary_loss_mlp": 0.01042169, "balance_loss_clip": 1.06762969, "balance_loss_mlp": 1.03099346, "epoch": 0.12914086454638368, "flos": 24170610850560.0, "grad_norm": 1.6040887869025755, "language_loss": 0.8330344, "learning_rate": 3.897880934963007e-06, "loss": 0.85570788, "num_input_tokens_seen": 22750970, "step": 1074, "time_per_iteration": 2.5432510375976562 }, { "auxiliary_loss_clip": 0.01203278, "auxiliary_loss_mlp": 0.01040597, "balance_loss_clip": 1.06255579, "balance_loss_mlp": 1.03034496, "epoch": 0.1292611074370228, "flos": 20267157081600.0, "grad_norm": 2.223900936652385, "language_loss": 0.7885235, "learning_rate": 3.89763506035154e-06, "loss": 0.8109622, "num_input_tokens_seen": 22768820, "step": 1075, "time_per_iteration": 2.55460524559021 }, { "auxiliary_loss_clip": 0.01208227, "auxiliary_loss_mlp": 0.01042921, "balance_loss_clip": 1.06471598, "balance_loss_mlp": 1.03299665, "epoch": 0.12938135032766188, "flos": 27377684668800.0, "grad_norm": 1.9846482448184568, "language_loss": 0.81389189, "learning_rate": 3.897388897870343e-06, "loss": 0.83640337, "num_input_tokens_seen": 22789460, "step": 1076, "time_per_iteration": 2.5815446376800537 }, { "auxiliary_loss_clip": 0.01218741, "auxiliary_loss_mlp": 0.01038614, "balance_loss_clip": 1.06675339, "balance_loss_mlp": 1.02762282, "epoch": 0.12950159321830096, "flos": 29277107861760.0, "grad_norm": 2.010272583759834, "language_loss": 0.74907184, "learning_rate": 3.89714244755676e-06, "loss": 0.77164537, "num_input_tokens_seen": 22810820, "step": 1077, "time_per_iteration": 2.6025137901306152 }, { "auxiliary_loss_clip": 0.01158042, "auxiliary_loss_mlp": 0.01046506, "balance_loss_clip": 1.0539825, "balance_loss_mlp": 1.03581285, "epoch": 0.12962183610894007, "flos": 24534888629760.0, "grad_norm": 2.788753932859496, "language_loss": 0.86120832, "learning_rate": 3.896895709448175e-06, "loss": 0.88325381, "num_input_tokens_seen": 22830570, "step": 1078, "time_per_iteration": 2.6355793476104736 }, { "auxiliary_loss_clip": 0.01150097, "auxiliary_loss_mlp": 0.01039747, "balance_loss_clip": 1.05335402, "balance_loss_mlp": 1.0289706, "epoch": 0.12974207899957915, "flos": 11215944552960.0, "grad_norm": 2.506821613467787, "language_loss": 0.7715832, "learning_rate": 3.896648683582019e-06, "loss": 0.79348165, "num_input_tokens_seen": 22845905, "step": 1079, "time_per_iteration": 2.6121151447296143 }, { "auxiliary_loss_clip": 0.01175785, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.06349158, "balance_loss_mlp": 1.0271368, "epoch": 0.12986232189021824, "flos": 24717889445760.0, "grad_norm": 2.077727509405393, "language_loss": 0.81146133, "learning_rate": 3.896401369995766e-06, "loss": 0.83359259, "num_input_tokens_seen": 22865710, "step": 1080, "time_per_iteration": 2.652203321456909 }, { "auxiliary_loss_clip": 0.01235741, "auxiliary_loss_mlp": 0.01048084, "balance_loss_clip": 1.0676527, "balance_loss_mlp": 1.03718805, "epoch": 0.12998256478085732, "flos": 23915357827200.0, "grad_norm": 1.8472068778746615, "language_loss": 0.79423982, "learning_rate": 3.896153768726932e-06, "loss": 0.81707811, "num_input_tokens_seen": 22886020, "step": 1081, "time_per_iteration": 2.5074172019958496 }, { "auxiliary_loss_clip": 0.01219518, "auxiliary_loss_mlp": 0.01041225, "balance_loss_clip": 1.06879735, "balance_loss_mlp": 1.03098512, "epoch": 0.13010280767149643, "flos": 18624207974400.0, "grad_norm": 3.457752169268043, "language_loss": 0.87808055, "learning_rate": 3.8959058798130806e-06, "loss": 0.90068793, "num_input_tokens_seen": 22903995, "step": 1082, "time_per_iteration": 2.500200033187866 }, { "auxiliary_loss_clip": 0.01204605, "auxiliary_loss_mlp": 0.00764392, "balance_loss_clip": 1.06292176, "balance_loss_mlp": 1.00007427, "epoch": 0.1302230505621355, "flos": 22783992174720.0, "grad_norm": 1.671046759421625, "language_loss": 0.7482087, "learning_rate": 3.895657703291814e-06, "loss": 0.76789868, "num_input_tokens_seen": 22924100, "step": 1083, "time_per_iteration": 2.5692949295043945 }, { "auxiliary_loss_clip": 0.01213587, "auxiliary_loss_mlp": 0.0104095, "balance_loss_clip": 1.06223917, "balance_loss_mlp": 1.02898169, "epoch": 0.1303432934527746, "flos": 21323612920320.0, "grad_norm": 3.09108124320713, "language_loss": 0.78934336, "learning_rate": 3.895409239200781e-06, "loss": 0.81188869, "num_input_tokens_seen": 22939985, "step": 1084, "time_per_iteration": 3.2918388843536377 }, { "auxiliary_loss_clip": 0.01214201, "auxiliary_loss_mlp": 0.01037566, "balance_loss_clip": 1.06372011, "balance_loss_mlp": 1.02671838, "epoch": 0.1304635363434137, "flos": 20922490765440.0, "grad_norm": 2.6420160555123697, "language_loss": 0.91216254, "learning_rate": 3.895160487577673e-06, "loss": 0.93468016, "num_input_tokens_seen": 22957555, "step": 1085, "time_per_iteration": 2.5015358924865723 }, { "auxiliary_loss_clip": 0.01117955, "auxiliary_loss_mlp": 0.01003347, "balance_loss_clip": 1.0354588, "balance_loss_mlp": 1.0004977, "epoch": 0.1305837792340528, "flos": 63245659080960.0, "grad_norm": 0.7826108519944255, "language_loss": 0.60902047, "learning_rate": 3.894911448460226e-06, "loss": 0.63023353, "num_input_tokens_seen": 23016870, "step": 1086, "time_per_iteration": 3.7135608196258545 }, { "auxiliary_loss_clip": 0.01126396, "auxiliary_loss_mlp": 0.01039791, "balance_loss_clip": 1.05444038, "balance_loss_mlp": 1.02948511, "epoch": 0.13070402212469187, "flos": 26428852955520.0, "grad_norm": 3.093885844408734, "language_loss": 0.72794288, "learning_rate": 3.8946621218862195e-06, "loss": 0.7496047, "num_input_tokens_seen": 23037870, "step": 1087, "time_per_iteration": 3.542111396789551 }, { "auxiliary_loss_clip": 0.01187018, "auxiliary_loss_mlp": 0.01045685, "balance_loss_clip": 1.06092954, "balance_loss_mlp": 1.0347594, "epoch": 0.13082426501533098, "flos": 27673409341440.0, "grad_norm": 2.0505530708368536, "language_loss": 0.89069784, "learning_rate": 3.894412507893475e-06, "loss": 0.91302478, "num_input_tokens_seen": 23058150, "step": 1088, "time_per_iteration": 2.645631790161133 }, { "auxiliary_loss_clip": 0.01181856, "auxiliary_loss_mlp": 0.01041655, "balance_loss_clip": 1.06087172, "balance_loss_mlp": 1.03062272, "epoch": 0.13094450790597006, "flos": 24826770547200.0, "grad_norm": 1.997153205952805, "language_loss": 0.72069687, "learning_rate": 3.894162606519859e-06, "loss": 0.74293196, "num_input_tokens_seen": 23077100, "step": 1089, "time_per_iteration": 2.639298439025879 }, { "auxiliary_loss_clip": 0.01170312, "auxiliary_loss_mlp": 0.01033489, "balance_loss_clip": 1.05938792, "balance_loss_mlp": 1.02311826, "epoch": 0.13106475079660915, "flos": 19062605468160.0, "grad_norm": 2.0633290717525665, "language_loss": 0.77350956, "learning_rate": 3.893912417803282e-06, "loss": 0.7955476, "num_input_tokens_seen": 23096815, "step": 1090, "time_per_iteration": 3.3392324447631836 }, { "auxiliary_loss_clip": 0.01169929, "auxiliary_loss_mlp": 0.01039163, "balance_loss_clip": 1.05380213, "balance_loss_mlp": 1.02805877, "epoch": 0.13118499368724823, "flos": 28913189218560.0, "grad_norm": 1.8559563495343867, "language_loss": 0.76453829, "learning_rate": 3.8936619417816975e-06, "loss": 0.7866292, "num_input_tokens_seen": 23117145, "step": 1091, "time_per_iteration": 2.6592886447906494 }, { "auxiliary_loss_clip": 0.01191227, "auxiliary_loss_mlp": 0.01049684, "balance_loss_clip": 1.06508636, "balance_loss_mlp": 1.03860319, "epoch": 0.13130523657788734, "flos": 14283398206080.0, "grad_norm": 1.8131272923169133, "language_loss": 0.71543336, "learning_rate": 3.8934111784931015e-06, "loss": 0.73784244, "num_input_tokens_seen": 23134595, "step": 1092, "time_per_iteration": 2.555454730987549 }, { "auxiliary_loss_clip": 0.01104841, "auxiliary_loss_mlp": 0.0100439, "balance_loss_clip": 1.03043973, "balance_loss_mlp": 1.0016129, "epoch": 0.13142547946852642, "flos": 70174155519360.0, "grad_norm": 0.990191517391737, "language_loss": 0.59061873, "learning_rate": 3.893160127975535e-06, "loss": 0.61171103, "num_input_tokens_seen": 23195285, "step": 1093, "time_per_iteration": 3.2444469928741455 }, { "auxiliary_loss_clip": 0.01180352, "auxiliary_loss_mlp": 0.01038853, "balance_loss_clip": 1.05835319, "balance_loss_mlp": 1.02877378, "epoch": 0.1315457223591655, "flos": 45805998844800.0, "grad_norm": 2.2641655821412545, "language_loss": 0.81359255, "learning_rate": 3.8929087902670826e-06, "loss": 0.83578455, "num_input_tokens_seen": 23216915, "step": 1094, "time_per_iteration": 2.8174192905426025 }, { "auxiliary_loss_clip": 0.01115483, "auxiliary_loss_mlp": 0.01003374, "balance_loss_clip": 1.0285337, "balance_loss_mlp": 1.0005368, "epoch": 0.13166596524980462, "flos": 62881165820160.0, "grad_norm": 0.9259716268126945, "language_loss": 0.60692918, "learning_rate": 3.8926571654058715e-06, "loss": 0.6281178, "num_input_tokens_seen": 23273560, "step": 1095, "time_per_iteration": 3.031236171722412 }, { "auxiliary_loss_clip": 0.01188913, "auxiliary_loss_mlp": 0.01041946, "balance_loss_clip": 1.06352246, "balance_loss_mlp": 1.03181326, "epoch": 0.1317862081404437, "flos": 23586523793280.0, "grad_norm": 3.8247342624576754, "language_loss": 0.77083004, "learning_rate": 3.892405253430074e-06, "loss": 0.79313862, "num_input_tokens_seen": 23291080, "step": 1096, "time_per_iteration": 2.707878351211548 }, { "auxiliary_loss_clip": 0.0121278, "auxiliary_loss_mlp": 0.00764253, "balance_loss_clip": 1.06629777, "balance_loss_mlp": 1.00004077, "epoch": 0.13190645103108278, "flos": 20260764460800.0, "grad_norm": 2.432152014739372, "language_loss": 0.82273233, "learning_rate": 3.892153054377904e-06, "loss": 0.84250265, "num_input_tokens_seen": 23308485, "step": 1097, "time_per_iteration": 2.5972392559051514 }, { "auxiliary_loss_clip": 0.01054078, "auxiliary_loss_mlp": 0.0100616, "balance_loss_clip": 1.02388835, "balance_loss_mlp": 1.00325131, "epoch": 0.13202669392172187, "flos": 53455440136320.0, "grad_norm": 0.9991126831087038, "language_loss": 0.59444469, "learning_rate": 3.891900568287619e-06, "loss": 0.61504704, "num_input_tokens_seen": 23360870, "step": 1098, "time_per_iteration": 3.1282007694244385 }, { "auxiliary_loss_clip": 0.01192207, "auxiliary_loss_mlp": 0.01040911, "balance_loss_clip": 1.05981302, "balance_loss_mlp": 1.0293839, "epoch": 0.13214693681236098, "flos": 15851293845120.0, "grad_norm": 2.707852829563897, "language_loss": 0.72586048, "learning_rate": 3.891647795197523e-06, "loss": 0.7481916, "num_input_tokens_seen": 23376910, "step": 1099, "time_per_iteration": 2.841768741607666 }, { "auxiliary_loss_clip": 0.01194541, "auxiliary_loss_mlp": 0.01042848, "balance_loss_clip": 1.05951071, "balance_loss_mlp": 1.03201747, "epoch": 0.13226717970300006, "flos": 19353840940800.0, "grad_norm": 2.1859448089054485, "language_loss": 0.68508744, "learning_rate": 3.8913947351459605e-06, "loss": 0.70746136, "num_input_tokens_seen": 23394450, "step": 1100, "time_per_iteration": 2.5741093158721924 }, { "auxiliary_loss_clip": 0.01236321, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.06783998, "balance_loss_mlp": 1.02913857, "epoch": 0.13238742259363914, "flos": 20698084546560.0, "grad_norm": 1.8962882495268178, "language_loss": 0.67583054, "learning_rate": 3.89114138817132e-06, "loss": 0.6985817, "num_input_tokens_seen": 23411115, "step": 1101, "time_per_iteration": 2.493234634399414 }, { "auxiliary_loss_clip": 0.01219882, "auxiliary_loss_mlp": 0.01041359, "balance_loss_clip": 1.06832194, "balance_loss_mlp": 1.03082693, "epoch": 0.13250766548427825, "flos": 21032449274880.0, "grad_norm": 1.9517497232671868, "language_loss": 0.84359211, "learning_rate": 3.890887754312035e-06, "loss": 0.8662045, "num_input_tokens_seen": 23429360, "step": 1102, "time_per_iteration": 2.4972715377807617 }, { "auxiliary_loss_clip": 0.01199824, "auxiliary_loss_mlp": 0.01038425, "balance_loss_clip": 1.06074739, "balance_loss_mlp": 1.02841735, "epoch": 0.13262790837491734, "flos": 22637871648000.0, "grad_norm": 1.7829331486085258, "language_loss": 0.87633824, "learning_rate": 3.890633833606581e-06, "loss": 0.89872074, "num_input_tokens_seen": 23449050, "step": 1103, "time_per_iteration": 2.5372328758239746 }, { "auxiliary_loss_clip": 0.01222158, "auxiliary_loss_mlp": 0.01041444, "balance_loss_clip": 1.07158792, "balance_loss_mlp": 1.03104854, "epoch": 0.13274815126555642, "flos": 19683141851520.0, "grad_norm": 1.7276098298992784, "language_loss": 0.69412512, "learning_rate": 3.890379626093477e-06, "loss": 0.71676111, "num_input_tokens_seen": 23468800, "step": 1104, "time_per_iteration": 2.506258487701416 }, { "auxiliary_loss_clip": 0.0116264, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 1.05792308, "balance_loss_mlp": 1.02291548, "epoch": 0.1328683941561955, "flos": 21317687176320.0, "grad_norm": 2.8742001804637454, "language_loss": 0.92538899, "learning_rate": 3.890125131811287e-06, "loss": 0.94734442, "num_input_tokens_seen": 23486850, "step": 1105, "time_per_iteration": 2.6051981449127197 }, { "auxiliary_loss_clip": 0.01189605, "auxiliary_loss_mlp": 0.01037476, "balance_loss_clip": 1.05710018, "balance_loss_mlp": 1.02773702, "epoch": 0.1329886370468346, "flos": 13699131580800.0, "grad_norm": 2.2387493368120492, "language_loss": 0.75461066, "learning_rate": 3.889870350798618e-06, "loss": 0.77688146, "num_input_tokens_seen": 23504195, "step": 1106, "time_per_iteration": 2.520110607147217 }, { "auxiliary_loss_clip": 0.01234204, "auxiliary_loss_mlp": 0.01039318, "balance_loss_clip": 1.06679511, "balance_loss_mlp": 1.02877378, "epoch": 0.1331088799374737, "flos": 21032413361280.0, "grad_norm": 1.5903168467201718, "language_loss": 0.78506261, "learning_rate": 3.889615283094119e-06, "loss": 0.80779779, "num_input_tokens_seen": 23523385, "step": 1107, "time_per_iteration": 2.4824838638305664 }, { "auxiliary_loss_clip": 0.01239381, "auxiliary_loss_mlp": 0.01042967, "balance_loss_clip": 1.06683755, "balance_loss_mlp": 1.03141594, "epoch": 0.13322912282811278, "flos": 18260432985600.0, "grad_norm": 1.991906230940626, "language_loss": 0.84380919, "learning_rate": 3.889359928736485e-06, "loss": 0.8666327, "num_input_tokens_seen": 23541330, "step": 1108, "time_per_iteration": 2.462399959564209 }, { "auxiliary_loss_clip": 0.01198381, "auxiliary_loss_mlp": 0.00764162, "balance_loss_clip": 1.06358278, "balance_loss_mlp": 1.00002074, "epoch": 0.1333493657187519, "flos": 24460876656000.0, "grad_norm": 2.165911575523885, "language_loss": 0.90957469, "learning_rate": 3.889104287764451e-06, "loss": 0.92920011, "num_input_tokens_seen": 23561705, "step": 1109, "time_per_iteration": 2.594787836074829 }, { "auxiliary_loss_clip": 0.0120574, "auxiliary_loss_mlp": 0.01041373, "balance_loss_clip": 1.06926918, "balance_loss_mlp": 1.03079331, "epoch": 0.13346960860939097, "flos": 22158930677760.0, "grad_norm": 1.9917991605200263, "language_loss": 0.90840089, "learning_rate": 3.888848360216798e-06, "loss": 0.93087208, "num_input_tokens_seen": 23579350, "step": 1110, "time_per_iteration": 3.3396666049957275 }, { "auxiliary_loss_clip": 0.01100919, "auxiliary_loss_mlp": 0.01006612, "balance_loss_clip": 1.02441216, "balance_loss_mlp": 1.00371504, "epoch": 0.13358985150003005, "flos": 67931212608000.0, "grad_norm": 0.7895816040523768, "language_loss": 0.56576914, "learning_rate": 3.888592146132351e-06, "loss": 0.5868445, "num_input_tokens_seen": 23640620, "step": 1111, "time_per_iteration": 3.2206993103027344 }, { "auxiliary_loss_clip": 0.01221945, "auxiliary_loss_mlp": 0.01036491, "balance_loss_clip": 1.0686655, "balance_loss_mlp": 1.02615571, "epoch": 0.13371009439066917, "flos": 26834284742400.0, "grad_norm": 2.083562897806881, "language_loss": 0.78610349, "learning_rate": 3.888335645549978e-06, "loss": 0.80868787, "num_input_tokens_seen": 23661040, "step": 1112, "time_per_iteration": 3.315871238708496 }, { "auxiliary_loss_clip": 0.01238626, "auxiliary_loss_mlp": 0.01042496, "balance_loss_clip": 1.07068491, "balance_loss_mlp": 1.03269696, "epoch": 0.13383033728130825, "flos": 26322844942080.0, "grad_norm": 2.3037942782484135, "language_loss": 0.81557512, "learning_rate": 3.888078858508588e-06, "loss": 0.83838642, "num_input_tokens_seen": 23680900, "step": 1113, "time_per_iteration": 3.2923686504364014 }, { "auxiliary_loss_clip": 0.01205822, "auxiliary_loss_mlp": 0.01041946, "balance_loss_clip": 1.0692637, "balance_loss_mlp": 1.03245056, "epoch": 0.13395058017194733, "flos": 22563931501440.0, "grad_norm": 1.6757867007411082, "language_loss": 0.84435284, "learning_rate": 3.8878217850471365e-06, "loss": 0.86683047, "num_input_tokens_seen": 23700815, "step": 1114, "time_per_iteration": 2.63095760345459 }, { "auxiliary_loss_clip": 0.01243009, "auxiliary_loss_mlp": 0.01048852, "balance_loss_clip": 1.07256651, "balance_loss_mlp": 1.03854585, "epoch": 0.13407082306258641, "flos": 25810938264960.0, "grad_norm": 1.8682284066975048, "language_loss": 0.73856801, "learning_rate": 3.887564425204621e-06, "loss": 0.76148665, "num_input_tokens_seen": 23722500, "step": 1115, "time_per_iteration": 2.524057626724243 }, { "auxiliary_loss_clip": 0.01082079, "auxiliary_loss_mlp": 0.01002566, "balance_loss_clip": 1.02439404, "balance_loss_mlp": 0.9995262, "epoch": 0.13419106595322552, "flos": 68338365269760.0, "grad_norm": 0.8576483780228971, "language_loss": 0.54734468, "learning_rate": 3.887306779020083e-06, "loss": 0.56819117, "num_input_tokens_seen": 23777155, "step": 1116, "time_per_iteration": 3.838433265686035 }, { "auxiliary_loss_clip": 0.01225783, "auxiliary_loss_mlp": 0.01040903, "balance_loss_clip": 1.06887877, "balance_loss_mlp": 1.0300138, "epoch": 0.1343113088438646, "flos": 20449080489600.0, "grad_norm": 2.3780901386180773, "language_loss": 0.70458007, "learning_rate": 3.887048846532608e-06, "loss": 0.727247, "num_input_tokens_seen": 23794130, "step": 1117, "time_per_iteration": 2.5200228691101074 }, { "auxiliary_loss_clip": 0.01082167, "auxiliary_loss_mlp": 0.01003955, "balance_loss_clip": 1.020033, "balance_loss_mlp": 1.00120151, "epoch": 0.1344315517345037, "flos": 67389784951680.0, "grad_norm": 0.7615194366654615, "language_loss": 0.5816226, "learning_rate": 3.8867906277813224e-06, "loss": 0.60248387, "num_input_tokens_seen": 23852285, "step": 1118, "time_per_iteration": 3.0330288410186768 }, { "auxiliary_loss_clip": 0.01228704, "auxiliary_loss_mlp": 0.00763465, "balance_loss_clip": 1.07133436, "balance_loss_mlp": 1.00006819, "epoch": 0.1345517946251428, "flos": 40734442788480.0, "grad_norm": 2.2327492687396324, "language_loss": 0.73994875, "learning_rate": 3.886532122805399e-06, "loss": 0.75987041, "num_input_tokens_seen": 23874765, "step": 1119, "time_per_iteration": 2.670726776123047 }, { "auxiliary_loss_clip": 0.01143759, "auxiliary_loss_mlp": 0.01040175, "balance_loss_clip": 1.05410326, "balance_loss_mlp": 1.02834988, "epoch": 0.13467203751578188, "flos": 22816850140800.0, "grad_norm": 2.035678919470623, "language_loss": 0.896725, "learning_rate": 3.886273331644053e-06, "loss": 0.91856432, "num_input_tokens_seen": 23893635, "step": 1120, "time_per_iteration": 2.6622493267059326 }, { "auxiliary_loss_clip": 0.01176451, "auxiliary_loss_mlp": 0.01038946, "balance_loss_clip": 1.06515288, "balance_loss_mlp": 1.02870011, "epoch": 0.13479228040642097, "flos": 17091576512640.0, "grad_norm": 2.230809638794972, "language_loss": 0.82186097, "learning_rate": 3.886014254336542e-06, "loss": 0.84401494, "num_input_tokens_seen": 23910110, "step": 1121, "time_per_iteration": 2.580785036087036 }, { "auxiliary_loss_clip": 0.01217711, "auxiliary_loss_mlp": 0.0103449, "balance_loss_clip": 1.06384087, "balance_loss_mlp": 1.02427387, "epoch": 0.13491252329706005, "flos": 23730525417600.0, "grad_norm": 1.7950515985489024, "language_loss": 0.9277209, "learning_rate": 3.885754890922168e-06, "loss": 0.95024294, "num_input_tokens_seen": 23930440, "step": 1122, "time_per_iteration": 2.546475410461426 }, { "auxiliary_loss_clip": 0.01134924, "auxiliary_loss_mlp": 0.01034934, "balance_loss_clip": 1.05812025, "balance_loss_mlp": 1.02390158, "epoch": 0.13503276618769916, "flos": 34127058960000.0, "grad_norm": 1.7578511347297217, "language_loss": 0.78192395, "learning_rate": 3.885495241440277e-06, "loss": 0.80362248, "num_input_tokens_seen": 23954535, "step": 1123, "time_per_iteration": 2.9330544471740723 }, { "auxiliary_loss_clip": 0.01238627, "auxiliary_loss_mlp": 0.01042118, "balance_loss_clip": 1.06967354, "balance_loss_mlp": 1.03066242, "epoch": 0.13515300907833824, "flos": 17712328377600.0, "grad_norm": 1.7088415671969657, "language_loss": 0.74298728, "learning_rate": 3.885235305930257e-06, "loss": 0.76579475, "num_input_tokens_seen": 23972735, "step": 1124, "time_per_iteration": 2.775200366973877 }, { "auxiliary_loss_clip": 0.01190843, "auxiliary_loss_mlp": 0.01048259, "balance_loss_clip": 1.06791425, "balance_loss_mlp": 1.03707731, "epoch": 0.13527325196897733, "flos": 20260872201600.0, "grad_norm": 1.9254887649025159, "language_loss": 0.85758221, "learning_rate": 3.884975084431539e-06, "loss": 0.87997329, "num_input_tokens_seen": 23987685, "step": 1125, "time_per_iteration": 2.5701098442077637 }, { "auxiliary_loss_clip": 0.01214913, "auxiliary_loss_mlp": 0.0076362, "balance_loss_clip": 1.06769311, "balance_loss_mlp": 1.00002646, "epoch": 0.13539349485961644, "flos": 18186492839040.0, "grad_norm": 2.7822546174349254, "language_loss": 0.9173485, "learning_rate": 3.8847145769836e-06, "loss": 0.93713379, "num_input_tokens_seen": 24004105, "step": 1126, "time_per_iteration": 2.4926645755767822 }, { "auxiliary_loss_clip": 0.01243262, "auxiliary_loss_mlp": 0.01044506, "balance_loss_clip": 1.07253468, "balance_loss_mlp": 1.03313386, "epoch": 0.13551373775025552, "flos": 19317463441920.0, "grad_norm": 3.0543462769511254, "language_loss": 0.66461194, "learning_rate": 3.884453783625959e-06, "loss": 0.68748963, "num_input_tokens_seen": 24021715, "step": 1127, "time_per_iteration": 2.468153715133667 }, { "auxiliary_loss_clip": 0.01202113, "auxiliary_loss_mlp": 0.01032469, "balance_loss_clip": 1.06510186, "balance_loss_mlp": 1.02213311, "epoch": 0.1356339806408946, "flos": 20850813175680.0, "grad_norm": 2.851723123638947, "language_loss": 0.84840971, "learning_rate": 3.884192704398176e-06, "loss": 0.87075555, "num_input_tokens_seen": 24038915, "step": 1128, "time_per_iteration": 2.5372021198272705 }, { "auxiliary_loss_clip": 0.01226204, "auxiliary_loss_mlp": 0.01043395, "balance_loss_clip": 1.06909537, "balance_loss_mlp": 1.03320897, "epoch": 0.13575422353153369, "flos": 50476037696640.0, "grad_norm": 1.6816622764400155, "language_loss": 0.74245942, "learning_rate": 3.883931339339858e-06, "loss": 0.76515543, "num_input_tokens_seen": 24063300, "step": 1129, "time_per_iteration": 2.783559799194336 }, { "auxiliary_loss_clip": 0.01230181, "auxiliary_loss_mlp": 0.01036587, "balance_loss_clip": 1.06877148, "balance_loss_mlp": 1.02492785, "epoch": 0.1358744664221728, "flos": 18150797698560.0, "grad_norm": 2.87518929652613, "language_loss": 0.78960055, "learning_rate": 3.883669688490654e-06, "loss": 0.81226814, "num_input_tokens_seen": 24081070, "step": 1130, "time_per_iteration": 2.5040786266326904 }, { "auxiliary_loss_clip": 0.01196868, "auxiliary_loss_mlp": 0.00763939, "balance_loss_clip": 1.06340885, "balance_loss_mlp": 1.00002766, "epoch": 0.13599470931281188, "flos": 18442966924800.0, "grad_norm": 1.760592807308845, "language_loss": 0.85254437, "learning_rate": 3.883407751890256e-06, "loss": 0.87215245, "num_input_tokens_seen": 24099675, "step": 1131, "time_per_iteration": 2.540595531463623 }, { "auxiliary_loss_clip": 0.01189569, "auxiliary_loss_mlp": 0.01053139, "balance_loss_clip": 1.0616672, "balance_loss_mlp": 1.04134369, "epoch": 0.13611495220345096, "flos": 26680766014080.0, "grad_norm": 1.7244084684688303, "language_loss": 0.85677087, "learning_rate": 3.8831455295783994e-06, "loss": 0.87919796, "num_input_tokens_seen": 24118925, "step": 1132, "time_per_iteration": 2.668065071105957 }, { "auxiliary_loss_clip": 0.01199704, "auxiliary_loss_mlp": 0.01041149, "balance_loss_clip": 1.06322074, "balance_loss_mlp": 1.03071237, "epoch": 0.13623519509409007, "flos": 21686238673920.0, "grad_norm": 1.6932830496390316, "language_loss": 0.74116409, "learning_rate": 3.882883021594864e-06, "loss": 0.76357269, "num_input_tokens_seen": 24137065, "step": 1133, "time_per_iteration": 2.5827267169952393 }, { "auxiliary_loss_clip": 0.01186156, "auxiliary_loss_mlp": 0.01039199, "balance_loss_clip": 1.06685448, "balance_loss_mlp": 1.0286665, "epoch": 0.13635543798472916, "flos": 14830389492480.0, "grad_norm": 2.133726437370247, "language_loss": 0.86785662, "learning_rate": 3.8826202279794705e-06, "loss": 0.89011019, "num_input_tokens_seen": 24154125, "step": 1134, "time_per_iteration": 2.5417089462280273 }, { "auxiliary_loss_clip": 0.01244427, "auxiliary_loss_mlp": 0.01038021, "balance_loss_clip": 1.07556391, "balance_loss_mlp": 1.02693439, "epoch": 0.13647568087536824, "flos": 22890323410560.0, "grad_norm": 1.9784777363543373, "language_loss": 0.69955075, "learning_rate": 3.882357148772085e-06, "loss": 0.72237521, "num_input_tokens_seen": 24171550, "step": 1135, "time_per_iteration": 2.4768245220184326 }, { "auxiliary_loss_clip": 0.01173109, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.06081605, "balance_loss_mlp": 1.02534878, "epoch": 0.13659592376600732, "flos": 19937927998080.0, "grad_norm": 2.1685532595731543, "language_loss": 0.84322059, "learning_rate": 3.882093784012617e-06, "loss": 0.86531359, "num_input_tokens_seen": 24190190, "step": 1136, "time_per_iteration": 3.279172420501709 }, { "auxiliary_loss_clip": 0.01205418, "auxiliary_loss_mlp": 0.01042745, "balance_loss_clip": 1.06602895, "balance_loss_mlp": 1.03200424, "epoch": 0.13671616665664643, "flos": 21428579439360.0, "grad_norm": 3.016829545914213, "language_loss": 0.84346777, "learning_rate": 3.881830133741019e-06, "loss": 0.86594939, "num_input_tokens_seen": 24209055, "step": 1137, "time_per_iteration": 2.554168224334717 }, { "auxiliary_loss_clip": 0.01190138, "auxiliary_loss_mlp": 0.01041915, "balance_loss_clip": 1.06542563, "balance_loss_mlp": 1.03112113, "epoch": 0.13683640954728551, "flos": 22778138257920.0, "grad_norm": 2.25583779416728, "language_loss": 0.76518047, "learning_rate": 3.881566197997285e-06, "loss": 0.78750098, "num_input_tokens_seen": 24225490, "step": 1138, "time_per_iteration": 3.2935361862182617 }, { "auxiliary_loss_clip": 0.0120532, "auxiliary_loss_mlp": 0.01034923, "balance_loss_clip": 1.07150459, "balance_loss_mlp": 1.02599454, "epoch": 0.1369566524379246, "flos": 21725884310400.0, "grad_norm": 1.5380594193272303, "language_loss": 0.74724793, "learning_rate": 3.881301976821456e-06, "loss": 0.76965034, "num_input_tokens_seen": 24245520, "step": 1139, "time_per_iteration": 2.569831132888794 }, { "auxiliary_loss_clip": 0.01220437, "auxiliary_loss_mlp": 0.01055003, "balance_loss_clip": 1.07040215, "balance_loss_mlp": 1.04451299, "epoch": 0.1370768953285637, "flos": 18624459369600.0, "grad_norm": 1.974003263037538, "language_loss": 0.90546453, "learning_rate": 3.881037470253612e-06, "loss": 0.9282189, "num_input_tokens_seen": 24265035, "step": 1140, "time_per_iteration": 3.2378902435302734 }, { "auxiliary_loss_clip": 0.01173965, "auxiliary_loss_mlp": 0.01041635, "balance_loss_clip": 1.0633359, "balance_loss_mlp": 1.03121591, "epoch": 0.1371971382192028, "flos": 14939521989120.0, "grad_norm": 2.551260192676724, "language_loss": 0.79472005, "learning_rate": 3.88077267833388e-06, "loss": 0.81687605, "num_input_tokens_seen": 24281550, "step": 1141, "time_per_iteration": 2.607044219970703 }, { "auxiliary_loss_clip": 0.01168256, "auxiliary_loss_mlp": 0.01045528, "balance_loss_clip": 1.06044364, "balance_loss_mlp": 1.03499007, "epoch": 0.13731738110984187, "flos": 19023785844480.0, "grad_norm": 2.4540920970050055, "language_loss": 0.83946609, "learning_rate": 3.880507601102427e-06, "loss": 0.86160398, "num_input_tokens_seen": 24299485, "step": 1142, "time_per_iteration": 3.3360414505004883 }, { "auxiliary_loss_clip": 0.01239483, "auxiliary_loss_mlp": 0.0104365, "balance_loss_clip": 1.07405138, "balance_loss_mlp": 1.0329864, "epoch": 0.13743762400048098, "flos": 18187462506240.0, "grad_norm": 1.9891838609193278, "language_loss": 0.82009637, "learning_rate": 3.880242238599467e-06, "loss": 0.84292769, "num_input_tokens_seen": 24316010, "step": 1143, "time_per_iteration": 2.464167594909668 }, { "auxiliary_loss_clip": 0.01237853, "auxiliary_loss_mlp": 0.01044802, "balance_loss_clip": 1.07210159, "balance_loss_mlp": 1.03452611, "epoch": 0.13755786689112007, "flos": 21031982398080.0, "grad_norm": 1.810489523737688, "language_loss": 0.83302999, "learning_rate": 3.879976590865254e-06, "loss": 0.85585654, "num_input_tokens_seen": 24335465, "step": 1144, "time_per_iteration": 2.5151267051696777 }, { "auxiliary_loss_clip": 0.01206537, "auxiliary_loss_mlp": 0.01043413, "balance_loss_clip": 1.06874466, "balance_loss_mlp": 1.03367376, "epoch": 0.13767810978175915, "flos": 21360636864000.0, "grad_norm": 2.4492752068655173, "language_loss": 0.87256116, "learning_rate": 3.879710657940087e-06, "loss": 0.89506072, "num_input_tokens_seen": 24354415, "step": 1145, "time_per_iteration": 2.5684032440185547 }, { "auxiliary_loss_clip": 0.01226121, "auxiliary_loss_mlp": 0.01045909, "balance_loss_clip": 1.07200193, "balance_loss_mlp": 1.03479266, "epoch": 0.13779835267239823, "flos": 30592084861440.0, "grad_norm": 1.8142344330606872, "language_loss": 0.70044303, "learning_rate": 3.879444439864308e-06, "loss": 0.72316331, "num_input_tokens_seen": 24373990, "step": 1146, "time_per_iteration": 2.6162970066070557 }, { "auxiliary_loss_clip": 0.01223864, "auxiliary_loss_mlp": 0.00764626, "balance_loss_clip": 1.06775951, "balance_loss_mlp": 1.00007868, "epoch": 0.13791859556303734, "flos": 22669867687680.0, "grad_norm": 1.8247518730885273, "language_loss": 0.85947877, "learning_rate": 3.879177936678301e-06, "loss": 0.87936372, "num_input_tokens_seen": 24392995, "step": 1147, "time_per_iteration": 2.558215379714966 }, { "auxiliary_loss_clip": 0.01211205, "auxiliary_loss_mlp": 0.01051362, "balance_loss_clip": 1.06756783, "balance_loss_mlp": 1.04016256, "epoch": 0.13803883845367643, "flos": 35224166016000.0, "grad_norm": 2.016657929155821, "language_loss": 0.7714982, "learning_rate": 3.878911148422496e-06, "loss": 0.79412389, "num_input_tokens_seen": 24414470, "step": 1148, "time_per_iteration": 2.679152727127075 }, { "auxiliary_loss_clip": 0.0122311, "auxiliary_loss_mlp": 0.01039296, "balance_loss_clip": 1.06740451, "balance_loss_mlp": 1.02801871, "epoch": 0.1381590813443155, "flos": 32014542332160.0, "grad_norm": 2.0965841062838457, "language_loss": 0.70386118, "learning_rate": 3.878644075137364e-06, "loss": 0.72648525, "num_input_tokens_seen": 24435120, "step": 1149, "time_per_iteration": 2.603072166442871 }, { "auxiliary_loss_clip": 0.01168865, "auxiliary_loss_mlp": 0.01037359, "balance_loss_clip": 1.05771565, "balance_loss_mlp": 1.02698135, "epoch": 0.13827932423495462, "flos": 17821855923840.0, "grad_norm": 1.9303235662594027, "language_loss": 0.79020268, "learning_rate": 3.878376716863418e-06, "loss": 0.81226492, "num_input_tokens_seen": 24451420, "step": 1150, "time_per_iteration": 2.568640947341919 }, { "auxiliary_loss_clip": 0.01208529, "auxiliary_loss_mlp": 0.01043499, "balance_loss_clip": 1.0686388, "balance_loss_mlp": 1.03267467, "epoch": 0.1383995671255937, "flos": 19427098728960.0, "grad_norm": 2.1386331431536547, "language_loss": 0.71652067, "learning_rate": 3.878109073641219e-06, "loss": 0.73904097, "num_input_tokens_seen": 24470450, "step": 1151, "time_per_iteration": 2.5820083618164062 }, { "auxiliary_loss_clip": 0.01171966, "auxiliary_loss_mlp": 0.01039776, "balance_loss_clip": 1.06081545, "balance_loss_mlp": 1.02931511, "epoch": 0.13851981001623279, "flos": 28296603331200.0, "grad_norm": 1.5526199993874867, "language_loss": 0.81295621, "learning_rate": 3.877841145511366e-06, "loss": 0.83507365, "num_input_tokens_seen": 24493190, "step": 1152, "time_per_iteration": 2.6953811645507812 }, { "auxiliary_loss_clip": 0.0122924, "auxiliary_loss_mlp": 0.01046464, "balance_loss_clip": 1.07070696, "balance_loss_mlp": 1.03554416, "epoch": 0.13864005290687187, "flos": 21213079793280.0, "grad_norm": 1.7498995785154863, "language_loss": 0.82878846, "learning_rate": 3.8775729325145035e-06, "loss": 0.85154557, "num_input_tokens_seen": 24512425, "step": 1153, "time_per_iteration": 2.505779981613159 }, { "auxiliary_loss_clip": 0.01074198, "auxiliary_loss_mlp": 0.0100266, "balance_loss_clip": 1.02419162, "balance_loss_mlp": 0.99961996, "epoch": 0.13876029579751098, "flos": 71653389413760.0, "grad_norm": 0.7906288474545474, "language_loss": 0.64768398, "learning_rate": 3.877304434691321e-06, "loss": 0.6684525, "num_input_tokens_seen": 24579275, "step": 1154, "time_per_iteration": 3.3098843097686768 }, { "auxiliary_loss_clip": 0.01190417, "auxiliary_loss_mlp": 0.01044166, "balance_loss_clip": 1.06470239, "balance_loss_mlp": 1.03418803, "epoch": 0.13888053868815006, "flos": 21941348042880.0, "grad_norm": 1.728192063580859, "language_loss": 0.79743266, "learning_rate": 3.877035652082548e-06, "loss": 0.81977856, "num_input_tokens_seen": 24598720, "step": 1155, "time_per_iteration": 2.687314510345459 }, { "auxiliary_loss_clip": 0.01196053, "auxiliary_loss_mlp": 0.01045991, "balance_loss_clip": 1.06727493, "balance_loss_mlp": 1.0335989, "epoch": 0.13900078157878915, "flos": 19608627087360.0, "grad_norm": 1.8732829073156738, "language_loss": 0.85319138, "learning_rate": 3.87676658472896e-06, "loss": 0.87561178, "num_input_tokens_seen": 24617530, "step": 1156, "time_per_iteration": 2.5447165966033936 }, { "auxiliary_loss_clip": 0.01219975, "auxiliary_loss_mlp": 0.01036379, "balance_loss_clip": 1.06328905, "balance_loss_mlp": 1.02615714, "epoch": 0.13912102446942826, "flos": 22638051216000.0, "grad_norm": 1.922781142927629, "language_loss": 0.85300469, "learning_rate": 3.876497232671372e-06, "loss": 0.87556827, "num_input_tokens_seen": 24637485, "step": 1157, "time_per_iteration": 2.5204851627349854 }, { "auxiliary_loss_clip": 0.01177775, "auxiliary_loss_mlp": 0.01036177, "balance_loss_clip": 1.05973625, "balance_loss_mlp": 1.02524543, "epoch": 0.13924126736006734, "flos": 29643324975360.0, "grad_norm": 2.1414461794580153, "language_loss": 0.83581114, "learning_rate": 3.876227595950647e-06, "loss": 0.85795063, "num_input_tokens_seen": 24656915, "step": 1158, "time_per_iteration": 2.6877052783966064 }, { "auxiliary_loss_clip": 0.01239745, "auxiliary_loss_mlp": 0.01036426, "balance_loss_clip": 1.07319832, "balance_loss_mlp": 1.0257926, "epoch": 0.13936151025070642, "flos": 27417653527680.0, "grad_norm": 1.7322000058818168, "language_loss": 0.78887689, "learning_rate": 3.875957674607686e-06, "loss": 0.81163859, "num_input_tokens_seen": 24679190, "step": 1159, "time_per_iteration": 2.5359959602355957 }, { "auxiliary_loss_clip": 0.01211797, "auxiliary_loss_mlp": 0.00764896, "balance_loss_clip": 1.06291389, "balance_loss_mlp": 1.00008035, "epoch": 0.1394817531413455, "flos": 16399326625920.0, "grad_norm": 1.9577591347752015, "language_loss": 0.87827551, "learning_rate": 3.8756874686834386e-06, "loss": 0.89804244, "num_input_tokens_seen": 24697405, "step": 1160, "time_per_iteration": 2.481790542602539 }, { "auxiliary_loss_clip": 0.01228954, "auxiliary_loss_mlp": 0.00764404, "balance_loss_clip": 1.06827223, "balance_loss_mlp": 1.00008917, "epoch": 0.13960199603198462, "flos": 30922319525760.0, "grad_norm": 1.7368458239657774, "language_loss": 0.80511677, "learning_rate": 3.875416978218893e-06, "loss": 0.82505035, "num_input_tokens_seen": 24720600, "step": 1161, "time_per_iteration": 2.620042562484741 }, { "auxiliary_loss_clip": 0.0119759, "auxiliary_loss_mlp": 0.01044747, "balance_loss_clip": 1.05987275, "balance_loss_mlp": 1.03351712, "epoch": 0.1397222389226237, "flos": 18113773754880.0, "grad_norm": 2.4549737725805496, "language_loss": 0.83208954, "learning_rate": 3.8751462032550835e-06, "loss": 0.85451293, "num_input_tokens_seen": 24737605, "step": 1162, "time_per_iteration": 3.3946242332458496 }, { "auxiliary_loss_clip": 0.01206955, "auxiliary_loss_mlp": 0.01042705, "balance_loss_clip": 1.07017696, "balance_loss_mlp": 1.0326314, "epoch": 0.13984248181326278, "flos": 16872772815360.0, "grad_norm": 3.4371805308696306, "language_loss": 0.83049047, "learning_rate": 3.874875143833085e-06, "loss": 0.85298711, "num_input_tokens_seen": 24755845, "step": 1163, "time_per_iteration": 2.5443012714385986 }, { "auxiliary_loss_clip": 0.01226824, "auxiliary_loss_mlp": 0.01041089, "balance_loss_clip": 1.07003617, "balance_loss_mlp": 1.02994871, "epoch": 0.1399627247039019, "flos": 54121401267840.0, "grad_norm": 1.7728770708954875, "language_loss": 0.68531573, "learning_rate": 3.874603799994019e-06, "loss": 0.70799482, "num_input_tokens_seen": 24779380, "step": 1164, "time_per_iteration": 2.8329520225524902 }, { "auxiliary_loss_clip": 0.01184171, "auxiliary_loss_mlp": 0.01040275, "balance_loss_clip": 1.06203735, "balance_loss_mlp": 1.03094649, "epoch": 0.14008296759454097, "flos": 11765521618560.0, "grad_norm": 2.163132056055306, "language_loss": 0.8711552, "learning_rate": 3.874332171779046e-06, "loss": 0.89339966, "num_input_tokens_seen": 24794260, "step": 1165, "time_per_iteration": 3.364712953567505 }, { "auxiliary_loss_clip": 0.01186474, "auxiliary_loss_mlp": 0.01043632, "balance_loss_clip": 1.06126451, "balance_loss_mlp": 1.03299832, "epoch": 0.14020321048518006, "flos": 22017514832640.0, "grad_norm": 1.8784991732215723, "language_loss": 0.75701189, "learning_rate": 3.874060259229373e-06, "loss": 0.77931297, "num_input_tokens_seen": 24815835, "step": 1166, "time_per_iteration": 3.3699495792388916 }, { "auxiliary_loss_clip": 0.01227501, "auxiliary_loss_mlp": 0.01046504, "balance_loss_clip": 1.07141829, "balance_loss_mlp": 1.03630543, "epoch": 0.14032345337581917, "flos": 23404313076480.0, "grad_norm": 2.122148924545455, "language_loss": 0.93754208, "learning_rate": 3.873788062386249e-06, "loss": 0.96028209, "num_input_tokens_seen": 24834095, "step": 1167, "time_per_iteration": 3.337472677230835 }, { "auxiliary_loss_clip": 0.0120268, "auxiliary_loss_mlp": 0.01044965, "balance_loss_clip": 1.07090044, "balance_loss_mlp": 1.03410494, "epoch": 0.14044369626645825, "flos": 29645767100160.0, "grad_norm": 3.0740757612724146, "language_loss": 0.81968713, "learning_rate": 3.873515581290965e-06, "loss": 0.84216356, "num_input_tokens_seen": 24858900, "step": 1168, "time_per_iteration": 2.6590356826782227 }, { "auxiliary_loss_clip": 0.0119537, "auxiliary_loss_mlp": 0.01041121, "balance_loss_clip": 1.07136595, "balance_loss_mlp": 1.03074372, "epoch": 0.14056393915709733, "flos": 18332972501760.0, "grad_norm": 2.223487051503957, "language_loss": 0.75664914, "learning_rate": 3.8732428159848575e-06, "loss": 0.77901399, "num_input_tokens_seen": 24877875, "step": 1169, "time_per_iteration": 2.583380699157715 }, { "auxiliary_loss_clip": 0.01223989, "auxiliary_loss_mlp": 0.01040391, "balance_loss_clip": 1.07267499, "balance_loss_mlp": 1.02951908, "epoch": 0.14068418204773642, "flos": 26687517770880.0, "grad_norm": 1.9388476739443976, "language_loss": 0.78290719, "learning_rate": 3.872969766509304e-06, "loss": 0.80555099, "num_input_tokens_seen": 24898430, "step": 1170, "time_per_iteration": 2.564643621444702 }, { "auxiliary_loss_clip": 0.01077592, "auxiliary_loss_mlp": 0.01010244, "balance_loss_clip": 1.02460599, "balance_loss_mlp": 1.00743091, "epoch": 0.14080442493837553, "flos": 65259314501760.0, "grad_norm": 0.7649416942154666, "language_loss": 0.55689424, "learning_rate": 3.872696432905726e-06, "loss": 0.57777262, "num_input_tokens_seen": 24959250, "step": 1171, "time_per_iteration": 3.155531167984009 }, { "auxiliary_loss_clip": 0.01228327, "auxiliary_loss_mlp": 0.01043899, "balance_loss_clip": 1.06894374, "balance_loss_mlp": 1.03276491, "epoch": 0.1409246678290146, "flos": 25776715582080.0, "grad_norm": 2.296491865698173, "language_loss": 0.7160061, "learning_rate": 3.872422815215589e-06, "loss": 0.73872828, "num_input_tokens_seen": 24978330, "step": 1172, "time_per_iteration": 2.5400092601776123 }, { "auxiliary_loss_clip": 0.01215221, "auxiliary_loss_mlp": 0.01044628, "balance_loss_clip": 1.06396461, "balance_loss_mlp": 1.03275454, "epoch": 0.1410449107196537, "flos": 21868521217920.0, "grad_norm": 1.99115911783194, "language_loss": 0.74266034, "learning_rate": 3.8721489134803994e-06, "loss": 0.76525885, "num_input_tokens_seen": 24997120, "step": 1173, "time_per_iteration": 2.524367332458496 }, { "auxiliary_loss_clip": 0.01218942, "auxiliary_loss_mlp": 0.01048966, "balance_loss_clip": 1.06684709, "balance_loss_mlp": 1.03840971, "epoch": 0.1411651536102928, "flos": 16684133564160.0, "grad_norm": 2.790371512309975, "language_loss": 0.7261548, "learning_rate": 3.871874727741707e-06, "loss": 0.74883389, "num_input_tokens_seen": 25014350, "step": 1174, "time_per_iteration": 2.490488290786743 }, { "auxiliary_loss_clip": 0.0122222, "auxiliary_loss_mlp": 0.01044657, "balance_loss_clip": 1.0707432, "balance_loss_mlp": 1.03511441, "epoch": 0.1412853965009319, "flos": 20992264934400.0, "grad_norm": 1.8734726218812983, "language_loss": 0.96573216, "learning_rate": 3.871600258041108e-06, "loss": 0.98840094, "num_input_tokens_seen": 25033875, "step": 1175, "time_per_iteration": 2.518188714981079 }, { "auxiliary_loss_clip": 0.01204978, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.06504822, "balance_loss_mlp": 1.026088, "epoch": 0.14140563939157097, "flos": 20335279224960.0, "grad_norm": 10.392965848892324, "language_loss": 0.85461283, "learning_rate": 3.871325504420238e-06, "loss": 0.87703192, "num_input_tokens_seen": 25052865, "step": 1176, "time_per_iteration": 2.5256526470184326 }, { "auxiliary_loss_clip": 0.01239771, "auxiliary_loss_mlp": 0.01038329, "balance_loss_clip": 1.07256269, "balance_loss_mlp": 1.02729678, "epoch": 0.14152588228221005, "flos": 21068826773760.0, "grad_norm": 2.3339663899710685, "language_loss": 0.8182168, "learning_rate": 3.871050466920776e-06, "loss": 0.84099782, "num_input_tokens_seen": 25072770, "step": 1177, "time_per_iteration": 2.5006301403045654 }, { "auxiliary_loss_clip": 0.01181739, "auxiliary_loss_mlp": 0.01043194, "balance_loss_clip": 1.06085324, "balance_loss_mlp": 1.03341854, "epoch": 0.14164612517284916, "flos": 18223157646720.0, "grad_norm": 2.1126768482357976, "language_loss": 0.80001688, "learning_rate": 3.870775145584447e-06, "loss": 0.82226622, "num_input_tokens_seen": 25090550, "step": 1178, "time_per_iteration": 2.5550267696380615 }, { "auxiliary_loss_clip": 0.01216423, "auxiliary_loss_mlp": 0.01051215, "balance_loss_clip": 1.06946087, "balance_loss_mlp": 1.038221, "epoch": 0.14176636806348825, "flos": 22744454279040.0, "grad_norm": 3.800001600348268, "language_loss": 0.65203154, "learning_rate": 3.8704995404530145e-06, "loss": 0.67470789, "num_input_tokens_seen": 25106175, "step": 1179, "time_per_iteration": 2.550929546356201 }, { "auxiliary_loss_clip": 0.01237087, "auxiliary_loss_mlp": 0.01049896, "balance_loss_clip": 1.07175148, "balance_loss_mlp": 1.04045486, "epoch": 0.14188661095412733, "flos": 22091095843200.0, "grad_norm": 1.9055131790722095, "language_loss": 0.85245156, "learning_rate": 3.87022365156829e-06, "loss": 0.87532133, "num_input_tokens_seen": 25126890, "step": 1180, "time_per_iteration": 2.509553909301758 }, { "auxiliary_loss_clip": 0.01141299, "auxiliary_loss_mlp": 0.01046825, "balance_loss_clip": 1.055902, "balance_loss_mlp": 1.03624487, "epoch": 0.14200685384476644, "flos": 24352390604160.0, "grad_norm": 1.9999384973730896, "language_loss": 0.81007594, "learning_rate": 3.869947478972123e-06, "loss": 0.83195716, "num_input_tokens_seen": 25147915, "step": 1181, "time_per_iteration": 2.699185609817505 }, { "auxiliary_loss_clip": 0.01212855, "auxiliary_loss_mlp": 0.01050564, "balance_loss_clip": 1.06427288, "balance_loss_mlp": 1.04030013, "epoch": 0.14212709673540552, "flos": 24022048199040.0, "grad_norm": 1.894264614422643, "language_loss": 0.82204187, "learning_rate": 3.869671022706412e-06, "loss": 0.84467614, "num_input_tokens_seen": 25166645, "step": 1182, "time_per_iteration": 2.5384719371795654 }, { "auxiliary_loss_clip": 0.01158059, "auxiliary_loss_mlp": 0.01040314, "balance_loss_clip": 1.05718136, "balance_loss_mlp": 1.03021061, "epoch": 0.1422473396260446, "flos": 26431797870720.0, "grad_norm": 1.8391047519957893, "language_loss": 0.64955759, "learning_rate": 3.869394282813092e-06, "loss": 0.67154127, "num_input_tokens_seen": 25185845, "step": 1183, "time_per_iteration": 2.763465404510498 }, { "auxiliary_loss_clip": 0.01197477, "auxiliary_loss_mlp": 0.01040825, "balance_loss_clip": 1.06077528, "balance_loss_mlp": 1.02858281, "epoch": 0.1423675825166837, "flos": 17055306754560.0, "grad_norm": 3.0974082004399066, "language_loss": 0.88988018, "learning_rate": 3.869117259334147e-06, "loss": 0.91226315, "num_input_tokens_seen": 25203770, "step": 1184, "time_per_iteration": 2.560513734817505 }, { "auxiliary_loss_clip": 0.01217706, "auxiliary_loss_mlp": 0.01048486, "balance_loss_clip": 1.06657434, "balance_loss_mlp": 1.03709614, "epoch": 0.1424878254073228, "flos": 17929480049280.0, "grad_norm": 1.9723263223924634, "language_loss": 0.82074761, "learning_rate": 3.868839952311599e-06, "loss": 0.84340954, "num_input_tokens_seen": 25221725, "step": 1185, "time_per_iteration": 2.5030486583709717 }, { "auxiliary_loss_clip": 0.011988, "auxiliary_loss_mlp": 0.01048922, "balance_loss_clip": 1.06278062, "balance_loss_mlp": 1.03792453, "epoch": 0.14260806829796188, "flos": 20303606407680.0, "grad_norm": 2.322310254770216, "language_loss": 0.80318534, "learning_rate": 3.868562361787516e-06, "loss": 0.82566249, "num_input_tokens_seen": 25240855, "step": 1186, "time_per_iteration": 2.5204532146453857 }, { "auxiliary_loss_clip": 0.0113869, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.0556227, "balance_loss_mlp": 1.02836561, "epoch": 0.14272831118860096, "flos": 23185724860800.0, "grad_norm": 1.8978966597112648, "language_loss": 0.68900943, "learning_rate": 3.868284487804009e-06, "loss": 0.71078324, "num_input_tokens_seen": 25260085, "step": 1187, "time_per_iteration": 3.555598258972168 }, { "auxiliary_loss_clip": 0.01210845, "auxiliary_loss_mlp": 0.01046049, "balance_loss_clip": 1.06583333, "balance_loss_mlp": 1.03534389, "epoch": 0.14284855407924008, "flos": 27232210586880.0, "grad_norm": 1.5425162109390098, "language_loss": 0.78048992, "learning_rate": 3.86800633040323e-06, "loss": 0.80305886, "num_input_tokens_seen": 25280675, "step": 1188, "time_per_iteration": 2.708998203277588 }, { "auxiliary_loss_clip": 0.01206242, "auxiliary_loss_mlp": 0.00763831, "balance_loss_clip": 1.06910992, "balance_loss_mlp": 1.00011086, "epoch": 0.14296879696987916, "flos": 28184202696960.0, "grad_norm": 2.072506443447748, "language_loss": 0.78509831, "learning_rate": 3.867727889627376e-06, "loss": 0.80479908, "num_input_tokens_seen": 25300290, "step": 1189, "time_per_iteration": 2.614508628845215 }, { "auxiliary_loss_clip": 0.01182641, "auxiliary_loss_mlp": 0.0104288, "balance_loss_clip": 1.06495619, "balance_loss_mlp": 1.03233027, "epoch": 0.14308903986051824, "flos": 19390290266880.0, "grad_norm": 2.6694291891790622, "language_loss": 0.78084159, "learning_rate": 3.867449165518687e-06, "loss": 0.80309689, "num_input_tokens_seen": 25316760, "step": 1190, "time_per_iteration": 3.3341403007507324 }, { "auxiliary_loss_clip": 0.01237701, "auxiliary_loss_mlp": 0.00764448, "balance_loss_clip": 1.0679363, "balance_loss_mlp": 1.00010705, "epoch": 0.14320928275115732, "flos": 17457506317440.0, "grad_norm": 1.8571396496057127, "language_loss": 0.71168935, "learning_rate": 3.867170158119444e-06, "loss": 0.73171085, "num_input_tokens_seen": 25335760, "step": 1191, "time_per_iteration": 3.2022206783294678 }, { "auxiliary_loss_clip": 0.01241313, "auxiliary_loss_mlp": 0.01045407, "balance_loss_clip": 1.07175863, "balance_loss_mlp": 1.03440404, "epoch": 0.14332952564179643, "flos": 21466070259840.0, "grad_norm": 1.8351944652002223, "language_loss": 0.75704867, "learning_rate": 3.866890867471972e-06, "loss": 0.77991587, "num_input_tokens_seen": 25354230, "step": 1192, "time_per_iteration": 2.501220703125 }, { "auxiliary_loss_clip": 0.01200005, "auxiliary_loss_mlp": 0.01055107, "balance_loss_clip": 1.05999422, "balance_loss_mlp": 1.04385984, "epoch": 0.14344976853243552, "flos": 16396992241920.0, "grad_norm": 2.85677477603376, "language_loss": 0.89934504, "learning_rate": 3.86661129361864e-06, "loss": 0.92189622, "num_input_tokens_seen": 25368720, "step": 1193, "time_per_iteration": 3.2081103324890137 }, { "auxiliary_loss_clip": 0.01201997, "auxiliary_loss_mlp": 0.01045438, "balance_loss_clip": 1.06405044, "balance_loss_mlp": 1.0346849, "epoch": 0.1435700114230746, "flos": 18916736336640.0, "grad_norm": 2.302127986164106, "language_loss": 0.85733926, "learning_rate": 3.866331436601859e-06, "loss": 0.87981355, "num_input_tokens_seen": 25386715, "step": 1194, "time_per_iteration": 2.53012752532959 }, { "auxiliary_loss_clip": 0.01239177, "auxiliary_loss_mlp": 0.01049949, "balance_loss_clip": 1.07220721, "balance_loss_mlp": 1.03891611, "epoch": 0.1436902543137137, "flos": 19755394058880.0, "grad_norm": 2.0616442689449, "language_loss": 0.74070966, "learning_rate": 3.866051296464083e-06, "loss": 0.76360089, "num_input_tokens_seen": 25405550, "step": 1195, "time_per_iteration": 2.470418930053711 }, { "auxiliary_loss_clip": 0.01236414, "auxiliary_loss_mlp": 0.00764295, "balance_loss_clip": 1.06728888, "balance_loss_mlp": 1.00009096, "epoch": 0.1438104972043528, "flos": 14684807669760.0, "grad_norm": 7.806885811676236, "language_loss": 0.85434055, "learning_rate": 3.86577087324781e-06, "loss": 0.87434769, "num_input_tokens_seen": 25422040, "step": 1196, "time_per_iteration": 2.462236166000366 }, { "auxiliary_loss_clip": 0.01220617, "auxiliary_loss_mlp": 0.01042788, "balance_loss_clip": 1.06998634, "balance_loss_mlp": 1.03215468, "epoch": 0.14393074009499188, "flos": 17092330698240.0, "grad_norm": 1.9418607803400916, "language_loss": 0.780321, "learning_rate": 3.865490166995578e-06, "loss": 0.80295503, "num_input_tokens_seen": 25440270, "step": 1197, "time_per_iteration": 2.4981062412261963 }, { "auxiliary_loss_clip": 0.0122065, "auxiliary_loss_mlp": 0.01036547, "balance_loss_clip": 1.06787896, "balance_loss_mlp": 1.02559149, "epoch": 0.144050982985631, "flos": 30476200608000.0, "grad_norm": 2.1678999740330958, "language_loss": 0.83994919, "learning_rate": 3.86520917774997e-06, "loss": 0.86252117, "num_input_tokens_seen": 25459705, "step": 1198, "time_per_iteration": 2.5767323970794678 }, { "auxiliary_loss_clip": 0.01218813, "auxiliary_loss_mlp": 0.01036536, "balance_loss_clip": 1.0678308, "balance_loss_mlp": 1.02724934, "epoch": 0.14417122587627007, "flos": 17858484817920.0, "grad_norm": 7.437738764998897, "language_loss": 0.74914801, "learning_rate": 3.864927905553614e-06, "loss": 0.77170151, "num_input_tokens_seen": 25477615, "step": 1199, "time_per_iteration": 2.4848790168762207 }, { "auxiliary_loss_clip": 0.01188189, "auxiliary_loss_mlp": 0.01040452, "balance_loss_clip": 1.06330466, "balance_loss_mlp": 1.03073072, "epoch": 0.14429146876690915, "flos": 21613914639360.0, "grad_norm": 1.750155872338273, "language_loss": 0.89036977, "learning_rate": 3.8646463504491765e-06, "loss": 0.91265619, "num_input_tokens_seen": 25497750, "step": 1200, "time_per_iteration": 2.583354949951172 }, { "auxiliary_loss_clip": 0.01225046, "auxiliary_loss_mlp": 0.01042827, "balance_loss_clip": 1.07102084, "balance_loss_mlp": 1.03187132, "epoch": 0.14441171165754824, "flos": 23258120722560.0, "grad_norm": 1.8849858337742023, "language_loss": 0.83540046, "learning_rate": 3.8643645124793705e-06, "loss": 0.8580792, "num_input_tokens_seen": 25516650, "step": 1201, "time_per_iteration": 2.5331785678863525 }, { "auxiliary_loss_clip": 0.01218955, "auxiliary_loss_mlp": 0.01039641, "balance_loss_clip": 1.06797111, "balance_loss_mlp": 1.02948475, "epoch": 0.14453195454818735, "flos": 42854213963520.0, "grad_norm": 1.7525258514862025, "language_loss": 0.74943173, "learning_rate": 3.8640823916869515e-06, "loss": 0.77201772, "num_input_tokens_seen": 25540960, "step": 1202, "time_per_iteration": 2.73274827003479 }, { "auxiliary_loss_clip": 0.01238516, "auxiliary_loss_mlp": 0.01040989, "balance_loss_clip": 1.07061863, "balance_loss_mlp": 1.0304153, "epoch": 0.14465219743882643, "flos": 27235873774080.0, "grad_norm": 1.5101298519358748, "language_loss": 0.78354555, "learning_rate": 3.863799988114714e-06, "loss": 0.80634058, "num_input_tokens_seen": 25562990, "step": 1203, "time_per_iteration": 2.532618522644043 }, { "auxiliary_loss_clip": 0.01240154, "auxiliary_loss_mlp": 0.01049258, "balance_loss_clip": 1.07104063, "balance_loss_mlp": 1.03869033, "epoch": 0.1447724403294655, "flos": 16690705752960.0, "grad_norm": 2.358913495222585, "language_loss": 0.70430768, "learning_rate": 3.863517301805502e-06, "loss": 0.72720182, "num_input_tokens_seen": 25581380, "step": 1204, "time_per_iteration": 2.4636733531951904 }, { "auxiliary_loss_clip": 0.01189813, "auxiliary_loss_mlp": 0.01052736, "balance_loss_clip": 1.06480682, "balance_loss_mlp": 1.04174495, "epoch": 0.14489268322010462, "flos": 20073741321600.0, "grad_norm": 2.4236540586033355, "language_loss": 0.97206497, "learning_rate": 3.863234332802196e-06, "loss": 0.9944905, "num_input_tokens_seen": 25593585, "step": 1205, "time_per_iteration": 2.5376169681549072 }, { "auxiliary_loss_clip": 0.01205685, "auxiliary_loss_mlp": 0.01053954, "balance_loss_clip": 1.06634295, "balance_loss_mlp": 1.04424465, "epoch": 0.1450129261107437, "flos": 27125627955840.0, "grad_norm": 2.15950022428928, "language_loss": 0.73593235, "learning_rate": 3.862951081147723e-06, "loss": 0.75852877, "num_input_tokens_seen": 25613750, "step": 1206, "time_per_iteration": 2.6040773391723633 }, { "auxiliary_loss_clip": 0.01225383, "auxiliary_loss_mlp": 0.01029555, "balance_loss_clip": 1.07073784, "balance_loss_mlp": 1.02035797, "epoch": 0.1451331690013828, "flos": 25702344472320.0, "grad_norm": 2.21975543337457, "language_loss": 0.78156471, "learning_rate": 3.862667546885053e-06, "loss": 0.8041141, "num_input_tokens_seen": 25632300, "step": 1207, "time_per_iteration": 2.5649683475494385 }, { "auxiliary_loss_clip": 0.01208664, "auxiliary_loss_mlp": 0.01040224, "balance_loss_clip": 1.06342316, "balance_loss_mlp": 1.03009117, "epoch": 0.14525341189202187, "flos": 25737393168000.0, "grad_norm": 1.8263123138488657, "language_loss": 0.73672479, "learning_rate": 3.8623837300571965e-06, "loss": 0.75921375, "num_input_tokens_seen": 25651285, "step": 1208, "time_per_iteration": 2.573349714279175 }, { "auxiliary_loss_clip": 0.01236526, "auxiliary_loss_mlp": 0.01046693, "balance_loss_clip": 1.06831574, "balance_loss_mlp": 1.03651273, "epoch": 0.14537365478266098, "flos": 23073898844160.0, "grad_norm": 2.5398537289672287, "language_loss": 0.8397541, "learning_rate": 3.8620996307072085e-06, "loss": 0.86258626, "num_input_tokens_seen": 25671990, "step": 1209, "time_per_iteration": 2.5037333965301514 }, { "auxiliary_loss_clip": 0.01192445, "auxiliary_loss_mlp": 0.01038187, "balance_loss_clip": 1.05924892, "balance_loss_mlp": 1.0280242, "epoch": 0.14549389767330007, "flos": 20595021448320.0, "grad_norm": 1.8130490946862647, "language_loss": 0.64464813, "learning_rate": 3.861815248878188e-06, "loss": 0.6669544, "num_input_tokens_seen": 25689475, "step": 1210, "time_per_iteration": 2.573620319366455 }, { "auxiliary_loss_clip": 0.01198499, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.06632876, "balance_loss_mlp": 1.02128613, "epoch": 0.14561414056393915, "flos": 15121804533120.0, "grad_norm": 2.287243218526405, "language_loss": 0.79312414, "learning_rate": 3.861530584613274e-06, "loss": 0.81542361, "num_input_tokens_seen": 25707475, "step": 1211, "time_per_iteration": 2.524085760116577 }, { "auxiliary_loss_clip": 0.0121931, "auxiliary_loss_mlp": 0.0076427, "balance_loss_clip": 1.06680238, "balance_loss_mlp": 1.00008512, "epoch": 0.14573438345457826, "flos": 19427493778560.0, "grad_norm": 2.13414999522986, "language_loss": 0.82104164, "learning_rate": 3.86124563795565e-06, "loss": 0.84087741, "num_input_tokens_seen": 25726290, "step": 1212, "time_per_iteration": 2.518179178237915 }, { "auxiliary_loss_clip": 0.01235229, "auxiliary_loss_mlp": 0.01045457, "balance_loss_clip": 1.07143354, "balance_loss_mlp": 1.03541398, "epoch": 0.14585462634521734, "flos": 24828422572800.0, "grad_norm": 1.780015111174624, "language_loss": 0.70183754, "learning_rate": 3.860960408948543e-06, "loss": 0.72464436, "num_input_tokens_seen": 25748040, "step": 1213, "time_per_iteration": 3.2525079250335693 }, { "auxiliary_loss_clip": 0.01209535, "auxiliary_loss_mlp": 0.01040597, "balance_loss_clip": 1.06439734, "balance_loss_mlp": 1.03086925, "epoch": 0.14597486923585642, "flos": 15448627405440.0, "grad_norm": 2.505900079757993, "language_loss": 0.8986693, "learning_rate": 3.860674897635222e-06, "loss": 0.92117065, "num_input_tokens_seen": 25764525, "step": 1214, "time_per_iteration": 2.4681742191314697 }, { "auxiliary_loss_clip": 0.01218366, "auxiliary_loss_mlp": 0.01048691, "balance_loss_clip": 1.0658741, "balance_loss_mlp": 1.03795648, "epoch": 0.1460951121264955, "flos": 16655154266880.0, "grad_norm": 2.491793929092135, "language_loss": 0.8331666, "learning_rate": 3.860389104058998e-06, "loss": 0.85583723, "num_input_tokens_seen": 25782755, "step": 1215, "time_per_iteration": 2.4893887042999268 }, { "auxiliary_loss_clip": 0.01203695, "auxiliary_loss_mlp": 0.01041893, "balance_loss_clip": 1.06551313, "balance_loss_mlp": 1.03127742, "epoch": 0.14621535501713462, "flos": 24863291700480.0, "grad_norm": 2.4635722457433875, "language_loss": 0.72621191, "learning_rate": 3.860103028263227e-06, "loss": 0.74866772, "num_input_tokens_seen": 25805860, "step": 1216, "time_per_iteration": 2.5885047912597656 }, { "auxiliary_loss_clip": 0.01165983, "auxiliary_loss_mlp": 0.01042436, "balance_loss_clip": 1.05447531, "balance_loss_mlp": 1.03198743, "epoch": 0.1463355979077737, "flos": 25228000442880.0, "grad_norm": 2.0035692223552153, "language_loss": 0.70179909, "learning_rate": 3.859816670291304e-06, "loss": 0.72388327, "num_input_tokens_seen": 25824955, "step": 1217, "time_per_iteration": 3.4199001789093018 }, { "auxiliary_loss_clip": 0.01151335, "auxiliary_loss_mlp": 0.01045223, "balance_loss_clip": 1.05895269, "balance_loss_mlp": 1.03507257, "epoch": 0.14645584079841278, "flos": 22054143726720.0, "grad_norm": 2.4363594744974093, "language_loss": 0.9030624, "learning_rate": 3.859530030186672e-06, "loss": 0.92502797, "num_input_tokens_seen": 25841965, "step": 1218, "time_per_iteration": 3.3213143348693848 }, { "auxiliary_loss_clip": 0.0120925, "auxiliary_loss_mlp": 0.01039827, "balance_loss_clip": 1.06653953, "balance_loss_mlp": 1.02921176, "epoch": 0.1465760836890519, "flos": 23623870959360.0, "grad_norm": 126.12435513719466, "language_loss": 0.82851446, "learning_rate": 3.859243107992813e-06, "loss": 0.8510052, "num_input_tokens_seen": 25860770, "step": 1219, "time_per_iteration": 3.375338554382324 }, { "auxiliary_loss_clip": 0.01192572, "auxiliary_loss_mlp": 0.01040005, "balance_loss_clip": 1.0584774, "balance_loss_mlp": 1.02946711, "epoch": 0.14669632657969098, "flos": 37407893356800.0, "grad_norm": 2.3801636510324506, "language_loss": 0.7819373, "learning_rate": 3.858955903753252e-06, "loss": 0.80426306, "num_input_tokens_seen": 25879410, "step": 1220, "time_per_iteration": 2.7418296337127686 }, { "auxiliary_loss_clip": 0.01219585, "auxiliary_loss_mlp": 0.01037564, "balance_loss_clip": 1.06580448, "balance_loss_mlp": 1.02757978, "epoch": 0.14681656947033006, "flos": 28365910623360.0, "grad_norm": 1.6368415040399995, "language_loss": 0.8353225, "learning_rate": 3.858668417511559e-06, "loss": 0.857894, "num_input_tokens_seen": 25902160, "step": 1221, "time_per_iteration": 2.600917100906372 }, { "auxiliary_loss_clip": 0.01207971, "auxiliary_loss_mlp": 0.01037379, "balance_loss_clip": 1.06781805, "balance_loss_mlp": 1.02713943, "epoch": 0.14693681236096917, "flos": 18479488078080.0, "grad_norm": 2.008748524464274, "language_loss": 0.76309729, "learning_rate": 3.8583806493113445e-06, "loss": 0.78555083, "num_input_tokens_seen": 25920505, "step": 1222, "time_per_iteration": 2.532684087753296 }, { "auxiliary_loss_clip": 0.01217096, "auxiliary_loss_mlp": 0.01045283, "balance_loss_clip": 1.06718576, "balance_loss_mlp": 1.03492975, "epoch": 0.14705705525160825, "flos": 20777806782720.0, "grad_norm": 21.65130867512965, "language_loss": 0.82175773, "learning_rate": 3.858092599196263e-06, "loss": 0.84438151, "num_input_tokens_seen": 25938460, "step": 1223, "time_per_iteration": 2.520313024520874 }, { "auxiliary_loss_clip": 0.01214872, "auxiliary_loss_mlp": 0.01040273, "balance_loss_clip": 1.06513786, "balance_loss_mlp": 1.0298655, "epoch": 0.14717729814224734, "flos": 29932944336000.0, "grad_norm": 2.9650409227733205, "language_loss": 0.82650965, "learning_rate": 3.857804267210012e-06, "loss": 0.84906107, "num_input_tokens_seen": 25957760, "step": 1224, "time_per_iteration": 2.573158025741577 }, { "auxiliary_loss_clip": 0.01171219, "auxiliary_loss_mlp": 0.01037582, "balance_loss_clip": 1.05604887, "balance_loss_mlp": 1.02771163, "epoch": 0.14729754103288642, "flos": 20047491457920.0, "grad_norm": 1.9573344880507828, "language_loss": 0.88097692, "learning_rate": 3.857515653396331e-06, "loss": 0.90306491, "num_input_tokens_seen": 25974970, "step": 1225, "time_per_iteration": 2.564187526702881 }, { "auxiliary_loss_clip": 0.01174453, "auxiliary_loss_mlp": 0.01041378, "balance_loss_clip": 1.06007719, "balance_loss_mlp": 1.03162026, "epoch": 0.14741778392352553, "flos": 19281516906240.0, "grad_norm": 2.2131698639411095, "language_loss": 0.8671701, "learning_rate": 3.857226757799002e-06, "loss": 0.88932836, "num_input_tokens_seen": 25992525, "step": 1226, "time_per_iteration": 2.6035447120666504 }, { "auxiliary_loss_clip": 0.01202291, "auxiliary_loss_mlp": 0.01038263, "balance_loss_clip": 1.06191885, "balance_loss_mlp": 1.02787364, "epoch": 0.1475380268141646, "flos": 25411108999680.0, "grad_norm": 2.4199833198526, "language_loss": 0.74384409, "learning_rate": 3.85693758046185e-06, "loss": 0.76624966, "num_input_tokens_seen": 26010815, "step": 1227, "time_per_iteration": 2.591486692428589 }, { "auxiliary_loss_clip": 0.01235524, "auxiliary_loss_mlp": 0.0103081, "balance_loss_clip": 1.07165742, "balance_loss_mlp": 1.02157092, "epoch": 0.1476582697048037, "flos": 20847652778880.0, "grad_norm": 1.794290337170015, "language_loss": 0.8312875, "learning_rate": 3.8566481214287435e-06, "loss": 0.85395086, "num_input_tokens_seen": 26028935, "step": 1228, "time_per_iteration": 2.4958767890930176 }, { "auxiliary_loss_clip": 0.01177755, "auxiliary_loss_mlp": 0.01048504, "balance_loss_clip": 1.05671024, "balance_loss_mlp": 1.03927124, "epoch": 0.1477785125954428, "flos": 14028109269120.0, "grad_norm": 2.2192077802258328, "language_loss": 0.9022603, "learning_rate": 3.8563583807435935e-06, "loss": 0.92452288, "num_input_tokens_seen": 26045080, "step": 1229, "time_per_iteration": 2.5569844245910645 }, { "auxiliary_loss_clip": 0.01218102, "auxiliary_loss_mlp": 0.00763934, "balance_loss_clip": 1.065799, "balance_loss_mlp": 1.00011396, "epoch": 0.1478987554860819, "flos": 20516699842560.0, "grad_norm": 1.920996783308175, "language_loss": 0.77890736, "learning_rate": 3.856068358450353e-06, "loss": 0.79872775, "num_input_tokens_seen": 26065030, "step": 1230, "time_per_iteration": 2.5401666164398193 }, { "auxiliary_loss_clip": 0.01197051, "auxiliary_loss_mlp": 0.01046809, "balance_loss_clip": 1.06661689, "balance_loss_mlp": 1.03741515, "epoch": 0.14801899837672097, "flos": 17857012360320.0, "grad_norm": 1.8319963498551617, "language_loss": 0.85799837, "learning_rate": 3.8557780545930186e-06, "loss": 0.88043696, "num_input_tokens_seen": 26083445, "step": 1231, "time_per_iteration": 2.5340802669525146 }, { "auxiliary_loss_clip": 0.01201185, "auxiliary_loss_mlp": 0.01038438, "balance_loss_clip": 1.06662345, "balance_loss_mlp": 1.02823997, "epoch": 0.14813924126736006, "flos": 20881408584960.0, "grad_norm": 4.536479468467541, "language_loss": 0.79221082, "learning_rate": 3.855487469215628e-06, "loss": 0.81460696, "num_input_tokens_seen": 26102375, "step": 1232, "time_per_iteration": 2.5651161670684814 }, { "auxiliary_loss_clip": 0.01184046, "auxiliary_loss_mlp": 0.0103755, "balance_loss_clip": 1.06203723, "balance_loss_mlp": 1.02804315, "epoch": 0.14825948415799917, "flos": 37414070496000.0, "grad_norm": 2.3141789531318753, "language_loss": 0.72074735, "learning_rate": 3.855196602362264e-06, "loss": 0.74296331, "num_input_tokens_seen": 26125295, "step": 1233, "time_per_iteration": 2.7047815322875977 }, { "auxiliary_loss_clip": 0.0121672, "auxiliary_loss_mlp": 0.01034249, "balance_loss_clip": 1.06348538, "balance_loss_mlp": 1.02492046, "epoch": 0.14837972704863825, "flos": 22014641744640.0, "grad_norm": 2.759836005956032, "language_loss": 0.94438124, "learning_rate": 3.854905454077051e-06, "loss": 0.96689093, "num_input_tokens_seen": 26142905, "step": 1234, "time_per_iteration": 2.512117624282837 }, { "auxiliary_loss_clip": 0.01134758, "auxiliary_loss_mlp": 0.0103358, "balance_loss_clip": 1.05206752, "balance_loss_mlp": 1.02440667, "epoch": 0.14849996993927733, "flos": 20996323171200.0, "grad_norm": 1.799342135685345, "language_loss": 0.88395083, "learning_rate": 3.854614024404155e-06, "loss": 0.90563422, "num_input_tokens_seen": 26161215, "step": 1235, "time_per_iteration": 2.6659913063049316 }, { "auxiliary_loss_clip": 0.0118801, "auxiliary_loss_mlp": 0.01045039, "balance_loss_clip": 1.05992126, "balance_loss_mlp": 1.03572261, "epoch": 0.14862021282991644, "flos": 20047994248320.0, "grad_norm": 2.2246791207531884, "language_loss": 0.8939684, "learning_rate": 3.8543223133877865e-06, "loss": 0.91629887, "num_input_tokens_seen": 26179810, "step": 1236, "time_per_iteration": 2.5244293212890625 }, { "auxiliary_loss_clip": 0.01179603, "auxiliary_loss_mlp": 0.01046779, "balance_loss_clip": 1.0569551, "balance_loss_mlp": 1.03568697, "epoch": 0.14874045572055553, "flos": 22712027276160.0, "grad_norm": 1.9338246477101735, "language_loss": 0.88281071, "learning_rate": 3.854030321072198e-06, "loss": 0.90507454, "num_input_tokens_seen": 26199715, "step": 1237, "time_per_iteration": 2.5670957565307617 }, { "auxiliary_loss_clip": 0.01191532, "auxiliary_loss_mlp": 0.01044278, "balance_loss_clip": 1.06097507, "balance_loss_mlp": 1.03455615, "epoch": 0.1488606986111946, "flos": 25411288567680.0, "grad_norm": 1.9485664210385103, "language_loss": 0.73542875, "learning_rate": 3.853738047501682e-06, "loss": 0.75778687, "num_input_tokens_seen": 26220275, "step": 1238, "time_per_iteration": 2.622931480407715 }, { "auxiliary_loss_clip": 0.01220864, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.06830692, "balance_loss_mlp": 1.03018999, "epoch": 0.1489809415018337, "flos": 17018749687680.0, "grad_norm": 2.311163616947138, "language_loss": 0.77790856, "learning_rate": 3.85344549272058e-06, "loss": 0.80052209, "num_input_tokens_seen": 26238255, "step": 1239, "time_per_iteration": 3.325458288192749 }, { "auxiliary_loss_clip": 0.01213437, "auxiliary_loss_mlp": 0.01037293, "balance_loss_clip": 1.06554866, "balance_loss_mlp": 1.0273931, "epoch": 0.1491011843924728, "flos": 33659394860160.0, "grad_norm": 1.8156580564623166, "language_loss": 0.82844424, "learning_rate": 3.853152656773269e-06, "loss": 0.85095155, "num_input_tokens_seen": 26259690, "step": 1240, "time_per_iteration": 2.6338186264038086 }, { "auxiliary_loss_clip": 0.01199765, "auxiliary_loss_mlp": 0.01040952, "balance_loss_clip": 1.06367874, "balance_loss_mlp": 1.03128982, "epoch": 0.14922142728311188, "flos": 21179000764800.0, "grad_norm": 1.6569908625758358, "language_loss": 0.84640503, "learning_rate": 3.852859539704174e-06, "loss": 0.8688122, "num_input_tokens_seen": 26278990, "step": 1241, "time_per_iteration": 2.5596587657928467 }, { "auxiliary_loss_clip": 0.01168839, "auxiliary_loss_mlp": 0.01038059, "balance_loss_clip": 1.05789781, "balance_loss_mlp": 1.02797389, "epoch": 0.14934167017375097, "flos": 29860548474240.0, "grad_norm": 1.835476112114779, "language_loss": 0.7623288, "learning_rate": 3.85256614155776e-06, "loss": 0.78439778, "num_input_tokens_seen": 26299120, "step": 1242, "time_per_iteration": 3.413752317428589 }, { "auxiliary_loss_clip": 0.01213212, "auxiliary_loss_mlp": 0.01047164, "balance_loss_clip": 1.06097543, "balance_loss_mlp": 1.03741884, "epoch": 0.14946191306439008, "flos": 17019216564480.0, "grad_norm": 2.0492215053719023, "language_loss": 0.74554384, "learning_rate": 3.852272462378535e-06, "loss": 0.76814759, "num_input_tokens_seen": 26316995, "step": 1243, "time_per_iteration": 2.5034518241882324 }, { "auxiliary_loss_clip": 0.01199331, "auxiliary_loss_mlp": 0.01039098, "balance_loss_clip": 1.06175315, "balance_loss_mlp": 1.02981186, "epoch": 0.14958215595502916, "flos": 15669047214720.0, "grad_norm": 2.507181371958501, "language_loss": 0.77797937, "learning_rate": 3.85197850221105e-06, "loss": 0.80036366, "num_input_tokens_seen": 26333295, "step": 1244, "time_per_iteration": 3.9502391815185547 }, { "auxiliary_loss_clip": 0.01212757, "auxiliary_loss_mlp": 0.01035283, "balance_loss_clip": 1.06706762, "balance_loss_mlp": 1.02581203, "epoch": 0.14970239884566824, "flos": 33108560818560.0, "grad_norm": 2.3726623667044358, "language_loss": 0.75557154, "learning_rate": 3.851684261099899e-06, "loss": 0.77805191, "num_input_tokens_seen": 26355035, "step": 1245, "time_per_iteration": 2.6286158561706543 }, { "auxiliary_loss_clip": 0.01196867, "auxiliary_loss_mlp": 0.01034535, "balance_loss_clip": 1.06202912, "balance_loss_mlp": 1.02520132, "epoch": 0.14982264173630733, "flos": 17821245392640.0, "grad_norm": 2.06953269660528, "language_loss": 0.86732119, "learning_rate": 3.851389739089718e-06, "loss": 0.88963521, "num_input_tokens_seen": 26371655, "step": 1246, "time_per_iteration": 2.5257251262664795 }, { "auxiliary_loss_clip": 0.01218965, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.06917202, "balance_loss_mlp": 1.02779019, "epoch": 0.14994288462694644, "flos": 32409559175040.0, "grad_norm": 3.2010424294520945, "language_loss": 0.80093694, "learning_rate": 3.851094936225186e-06, "loss": 0.82350701, "num_input_tokens_seen": 26392540, "step": 1247, "time_per_iteration": 2.6059978008270264 }, { "auxiliary_loss_clip": 0.01197522, "auxiliary_loss_mlp": 0.01034969, "balance_loss_clip": 1.06480241, "balance_loss_mlp": 1.02514052, "epoch": 0.15006312751758552, "flos": 31794661226880.0, "grad_norm": 1.5221054215568754, "language_loss": 0.76697505, "learning_rate": 3.850799852551024e-06, "loss": 0.78929996, "num_input_tokens_seen": 26414960, "step": 1248, "time_per_iteration": 2.627321720123291 }, { "auxiliary_loss_clip": 0.01206191, "auxiliary_loss_mlp": 0.0104399, "balance_loss_clip": 1.06086278, "balance_loss_mlp": 1.03441155, "epoch": 0.1501833704082246, "flos": 16618022582400.0, "grad_norm": 2.2571696673926906, "language_loss": 0.85963368, "learning_rate": 3.850504488111995e-06, "loss": 0.88213551, "num_input_tokens_seen": 26431635, "step": 1249, "time_per_iteration": 2.5312087535858154 }, { "auxiliary_loss_clip": 0.01191656, "auxiliary_loss_mlp": 0.0103711, "balance_loss_clip": 1.05951965, "balance_loss_mlp": 1.02784169, "epoch": 0.15030361329886371, "flos": 23471178243840.0, "grad_norm": 1.8725446992473576, "language_loss": 0.82770312, "learning_rate": 3.850208842952907e-06, "loss": 0.84999079, "num_input_tokens_seen": 26450440, "step": 1250, "time_per_iteration": 2.6059553623199463 }, { "auxiliary_loss_clip": 0.0117614, "auxiliary_loss_mlp": 0.01034838, "balance_loss_clip": 1.05648148, "balance_loss_mlp": 1.02540874, "epoch": 0.1504238561895028, "flos": 25629409906560.0, "grad_norm": 1.8848417262908645, "language_loss": 0.79202056, "learning_rate": 3.849912917118608e-06, "loss": 0.81413037, "num_input_tokens_seen": 26471480, "step": 1251, "time_per_iteration": 2.6452410221099854 }, { "auxiliary_loss_clip": 0.01126418, "auxiliary_loss_mlp": 0.01037186, "balance_loss_clip": 1.03905678, "balance_loss_mlp": 1.03421807, "epoch": 0.15054409908014188, "flos": 52095146129280.0, "grad_norm": 0.8898160478092594, "language_loss": 0.59299958, "learning_rate": 3.849616710653992e-06, "loss": 0.61463559, "num_input_tokens_seen": 26532950, "step": 1252, "time_per_iteration": 3.123248815536499 }, { "auxiliary_loss_clip": 0.01215588, "auxiliary_loss_mlp": 0.01037339, "balance_loss_clip": 1.0654192, "balance_loss_mlp": 1.02740872, "epoch": 0.150664341970781, "flos": 18880251096960.0, "grad_norm": 1.6391089731606816, "language_loss": 0.74911976, "learning_rate": 3.84932022360399e-06, "loss": 0.771649, "num_input_tokens_seen": 26551615, "step": 1253, "time_per_iteration": 2.525238275527954 }, { "auxiliary_loss_clip": 0.0119814, "auxiliary_loss_mlp": 0.01043897, "balance_loss_clip": 1.06833661, "balance_loss_mlp": 1.0336628, "epoch": 0.15078458486142007, "flos": 22163240309760.0, "grad_norm": 2.8070992715039202, "language_loss": 0.84425253, "learning_rate": 3.849023456013581e-06, "loss": 0.86667287, "num_input_tokens_seen": 26569175, "step": 1254, "time_per_iteration": 2.5650267601013184 }, { "auxiliary_loss_clip": 0.01219972, "auxiliary_loss_mlp": 0.01046886, "balance_loss_clip": 1.06662714, "balance_loss_mlp": 1.03693163, "epoch": 0.15090482775205916, "flos": 26651894457600.0, "grad_norm": 2.977613070945915, "language_loss": 0.62039173, "learning_rate": 3.848726407927784e-06, "loss": 0.64306027, "num_input_tokens_seen": 26589560, "step": 1255, "time_per_iteration": 2.590996742248535 }, { "auxiliary_loss_clip": 0.01204443, "auxiliary_loss_mlp": 0.01041312, "balance_loss_clip": 1.06819761, "balance_loss_mlp": 1.03060734, "epoch": 0.15102507064269824, "flos": 21798998444160.0, "grad_norm": 2.324249326525796, "language_loss": 0.86564147, "learning_rate": 3.84842907939166e-06, "loss": 0.88809907, "num_input_tokens_seen": 26608785, "step": 1256, "time_per_iteration": 2.5579047203063965 }, { "auxiliary_loss_clip": 0.01179238, "auxiliary_loss_mlp": 0.01043049, "balance_loss_clip": 1.06031156, "balance_loss_mlp": 1.03298795, "epoch": 0.15114531353333735, "flos": 22820908377600.0, "grad_norm": 2.6541669305571123, "language_loss": 0.71345329, "learning_rate": 3.8481314704503146e-06, "loss": 0.73567617, "num_input_tokens_seen": 26628615, "step": 1257, "time_per_iteration": 2.6265079975128174 }, { "auxiliary_loss_clip": 0.0121475, "auxiliary_loss_mlp": 0.01054902, "balance_loss_clip": 1.06889272, "balance_loss_mlp": 1.04505491, "epoch": 0.15126555642397643, "flos": 19682674974720.0, "grad_norm": 2.2504887181899695, "language_loss": 0.8792153, "learning_rate": 3.847833581148895e-06, "loss": 0.90191185, "num_input_tokens_seen": 26647525, "step": 1258, "time_per_iteration": 2.530487060546875 }, { "auxiliary_loss_clip": 0.01231308, "auxiliary_loss_mlp": 0.01033265, "balance_loss_clip": 1.066185, "balance_loss_mlp": 1.02330554, "epoch": 0.15138579931461552, "flos": 28726022424960.0, "grad_norm": 2.207361799962119, "language_loss": 0.80591643, "learning_rate": 3.84753541153259e-06, "loss": 0.8285622, "num_input_tokens_seen": 26667095, "step": 1259, "time_per_iteration": 2.589521646499634 }, { "auxiliary_loss_clip": 0.01217481, "auxiliary_loss_mlp": 0.01036359, "balance_loss_clip": 1.06803918, "balance_loss_mlp": 1.02639902, "epoch": 0.15150604220525463, "flos": 22127006465280.0, "grad_norm": 1.8820076689272485, "language_loss": 0.83314407, "learning_rate": 3.847236961646633e-06, "loss": 0.85568243, "num_input_tokens_seen": 26686075, "step": 1260, "time_per_iteration": 2.522284746170044 }, { "auxiliary_loss_clip": 0.01192732, "auxiliary_loss_mlp": 0.01050287, "balance_loss_clip": 1.06183481, "balance_loss_mlp": 1.04050541, "epoch": 0.1516262850958937, "flos": 12968708515200.0, "grad_norm": 2.2137828507661417, "language_loss": 0.7848689, "learning_rate": 3.846938231536296e-06, "loss": 0.80729902, "num_input_tokens_seen": 26701695, "step": 1261, "time_per_iteration": 2.553976535797119 }, { "auxiliary_loss_clip": 0.01222371, "auxiliary_loss_mlp": 0.01042765, "balance_loss_clip": 1.06843984, "balance_loss_mlp": 1.03256059, "epoch": 0.1517465279865328, "flos": 21797130936960.0, "grad_norm": 1.6220275859618785, "language_loss": 0.80676317, "learning_rate": 3.8466392212468995e-06, "loss": 0.82941461, "num_input_tokens_seen": 26721885, "step": 1262, "time_per_iteration": 2.5212409496307373 }, { "auxiliary_loss_clip": 0.01100385, "auxiliary_loss_mlp": 0.01014094, "balance_loss_clip": 1.03325748, "balance_loss_mlp": 1.01130486, "epoch": 0.15186677087717187, "flos": 58174569901440.0, "grad_norm": 0.8194345273003114, "language_loss": 0.61961508, "learning_rate": 3.8463399308238e-06, "loss": 0.64075989, "num_input_tokens_seen": 26780990, "step": 1263, "time_per_iteration": 3.143031597137451 }, { "auxiliary_loss_clip": 0.01215643, "auxiliary_loss_mlp": 0.01048239, "balance_loss_clip": 1.06812525, "balance_loss_mlp": 1.03834462, "epoch": 0.15198701376781099, "flos": 32669696448000.0, "grad_norm": 2.861348128336932, "language_loss": 0.63879347, "learning_rate": 3.846040360312402e-06, "loss": 0.66143233, "num_input_tokens_seen": 26804250, "step": 1264, "time_per_iteration": 3.336589813232422 }, { "auxiliary_loss_clip": 0.01229117, "auxiliary_loss_mlp": 0.01040311, "balance_loss_clip": 1.06516171, "balance_loss_mlp": 1.03022611, "epoch": 0.15210725665845007, "flos": 28402575431040.0, "grad_norm": 2.1042744236454625, "language_loss": 0.81230032, "learning_rate": 3.8457405097581485e-06, "loss": 0.83499455, "num_input_tokens_seen": 26823240, "step": 1265, "time_per_iteration": 2.5521767139434814 }, { "auxiliary_loss_clip": 0.01172032, "auxiliary_loss_mlp": 0.01046916, "balance_loss_clip": 1.05521655, "balance_loss_mlp": 1.03674769, "epoch": 0.15222749954908915, "flos": 19938179393280.0, "grad_norm": 1.91911724527818, "language_loss": 0.78157377, "learning_rate": 3.8454403792065275e-06, "loss": 0.80376327, "num_input_tokens_seen": 26842060, "step": 1266, "time_per_iteration": 2.60555362701416 }, { "auxiliary_loss_clip": 0.01175756, "auxiliary_loss_mlp": 0.01040163, "balance_loss_clip": 1.06129575, "balance_loss_mlp": 1.03008437, "epoch": 0.15234774243972826, "flos": 21324223451520.0, "grad_norm": 1.994365011889893, "language_loss": 0.85539615, "learning_rate": 3.845139968703068e-06, "loss": 0.87755537, "num_input_tokens_seen": 26859580, "step": 1267, "time_per_iteration": 2.594228506088257 }, { "auxiliary_loss_clip": 0.0117031, "auxiliary_loss_mlp": 0.01041981, "balance_loss_clip": 1.05805206, "balance_loss_mlp": 1.03159809, "epoch": 0.15246798533036734, "flos": 25957812977280.0, "grad_norm": 2.0885397677778905, "language_loss": 0.83116251, "learning_rate": 3.844839278293342e-06, "loss": 0.85328543, "num_input_tokens_seen": 26880430, "step": 1268, "time_per_iteration": 3.4069008827209473 }, { "auxiliary_loss_clip": 0.01235798, "auxiliary_loss_mlp": 0.010436, "balance_loss_clip": 1.06972194, "balance_loss_mlp": 1.03290057, "epoch": 0.15258822822100643, "flos": 25811907932160.0, "grad_norm": 4.156160822937864, "language_loss": 0.76430798, "learning_rate": 3.8445383080229654e-06, "loss": 0.78710192, "num_input_tokens_seen": 26896445, "step": 1269, "time_per_iteration": 3.3065097332000732 }, { "auxiliary_loss_clip": 0.01192614, "auxiliary_loss_mlp": 0.01046604, "balance_loss_clip": 1.06062555, "balance_loss_mlp": 1.03634, "epoch": 0.1527084711116455, "flos": 25265455349760.0, "grad_norm": 3.6210443596574198, "language_loss": 0.73287261, "learning_rate": 3.844237057937593e-06, "loss": 0.75526476, "num_input_tokens_seen": 26915450, "step": 1270, "time_per_iteration": 3.4571926593780518 }, { "auxiliary_loss_clip": 0.01221118, "auxiliary_loss_mlp": 0.0104223, "balance_loss_clip": 1.06536674, "balance_loss_mlp": 1.03187037, "epoch": 0.15282871400228462, "flos": 29240227572480.0, "grad_norm": 2.1941191392480213, "language_loss": 0.77708435, "learning_rate": 3.843935528082926e-06, "loss": 0.7997179, "num_input_tokens_seen": 26936475, "step": 1271, "time_per_iteration": 2.5998668670654297 }, { "auxiliary_loss_clip": 0.01218437, "auxiliary_loss_mlp": 0.01036947, "balance_loss_clip": 1.06669927, "balance_loss_mlp": 1.02647495, "epoch": 0.1529489568929237, "flos": 20882952869760.0, "grad_norm": 1.9532299715171297, "language_loss": 0.84679854, "learning_rate": 3.843633718504704e-06, "loss": 0.86935234, "num_input_tokens_seen": 26954920, "step": 1272, "time_per_iteration": 2.5172414779663086 }, { "auxiliary_loss_clip": 0.01190148, "auxiliary_loss_mlp": 0.01055398, "balance_loss_clip": 1.0637176, "balance_loss_mlp": 1.045331, "epoch": 0.1530691997835628, "flos": 20083833043200.0, "grad_norm": 4.103450591303595, "language_loss": 0.8985815, "learning_rate": 3.843331629248715e-06, "loss": 0.92103702, "num_input_tokens_seen": 26972520, "step": 1273, "time_per_iteration": 2.593606472015381 }, { "auxiliary_loss_clip": 0.01232507, "auxiliary_loss_mlp": 0.01035289, "balance_loss_clip": 1.06970119, "balance_loss_mlp": 1.02572262, "epoch": 0.1531894426742019, "flos": 28759814144640.0, "grad_norm": 2.400743238880734, "language_loss": 0.76750219, "learning_rate": 3.843029260360782e-06, "loss": 0.79018021, "num_input_tokens_seen": 26990890, "step": 1274, "time_per_iteration": 2.5412042140960693 }, { "auxiliary_loss_clip": 0.01215027, "auxiliary_loss_mlp": 0.01050899, "balance_loss_clip": 1.06840062, "balance_loss_mlp": 1.04101062, "epoch": 0.15330968556484098, "flos": 22236282616320.0, "grad_norm": 1.8813892491913902, "language_loss": 0.78993058, "learning_rate": 3.8427266118867755e-06, "loss": 0.81258976, "num_input_tokens_seen": 27010640, "step": 1275, "time_per_iteration": 2.5872414112091064 }, { "auxiliary_loss_clip": 0.01201667, "auxiliary_loss_mlp": 0.01041177, "balance_loss_clip": 1.06608748, "balance_loss_mlp": 1.0305438, "epoch": 0.15342992845548006, "flos": 27527504296320.0, "grad_norm": 2.0130295149647757, "language_loss": 0.82632673, "learning_rate": 3.842423683872608e-06, "loss": 0.84875512, "num_input_tokens_seen": 27031215, "step": 1276, "time_per_iteration": 2.603322982788086 }, { "auxiliary_loss_clip": 0.01211349, "auxiliary_loss_mlp": 0.01041153, "balance_loss_clip": 1.062482, "balance_loss_mlp": 1.03077626, "epoch": 0.15355017134611917, "flos": 19609596754560.0, "grad_norm": 2.6238906278286493, "language_loss": 0.77468681, "learning_rate": 3.842120476364232e-06, "loss": 0.79721189, "num_input_tokens_seen": 27049665, "step": 1277, "time_per_iteration": 2.553133964538574 }, { "auxiliary_loss_clip": 0.01219452, "auxiliary_loss_mlp": 0.01040795, "balance_loss_clip": 1.06534529, "balance_loss_mlp": 1.03005993, "epoch": 0.15367041423675826, "flos": 18478590238080.0, "grad_norm": 2.0900399706312793, "language_loss": 0.83760548, "learning_rate": 3.841816989407644e-06, "loss": 0.86020797, "num_input_tokens_seen": 27065155, "step": 1278, "time_per_iteration": 2.4942965507507324 }, { "auxiliary_loss_clip": 0.01179704, "auxiliary_loss_mlp": 0.010428, "balance_loss_clip": 1.05924845, "balance_loss_mlp": 1.03321028, "epoch": 0.15379065712739734, "flos": 41427662342400.0, "grad_norm": 2.532464904028034, "language_loss": 0.76734936, "learning_rate": 3.841513223048884e-06, "loss": 0.78957444, "num_input_tokens_seen": 27085840, "step": 1279, "time_per_iteration": 2.7741076946258545 }, { "auxiliary_loss_clip": 0.01182079, "auxiliary_loss_mlp": 0.01042676, "balance_loss_clip": 1.06042171, "balance_loss_mlp": 1.03222156, "epoch": 0.15391090001803642, "flos": 22054215553920.0, "grad_norm": 2.4862515931981095, "language_loss": 0.78520322, "learning_rate": 3.841209177334031e-06, "loss": 0.80745077, "num_input_tokens_seen": 27104200, "step": 1280, "time_per_iteration": 2.615844249725342 }, { "auxiliary_loss_clip": 0.01213634, "auxiliary_loss_mlp": 0.01041002, "balance_loss_clip": 1.06682992, "balance_loss_mlp": 1.03094697, "epoch": 0.15403114290867553, "flos": 15450351258240.0, "grad_norm": 3.6544813138378918, "language_loss": 0.75005186, "learning_rate": 3.84090485230921e-06, "loss": 0.77259827, "num_input_tokens_seen": 27122440, "step": 1281, "time_per_iteration": 2.4972565174102783 }, { "auxiliary_loss_clip": 0.0122986, "auxiliary_loss_mlp": 0.01041908, "balance_loss_clip": 1.06781662, "balance_loss_mlp": 1.03176904, "epoch": 0.15415138579931462, "flos": 17929156826880.0, "grad_norm": 2.488696301385415, "language_loss": 0.77055323, "learning_rate": 3.840600248020588e-06, "loss": 0.79327095, "num_input_tokens_seen": 27139380, "step": 1282, "time_per_iteration": 2.443368911743164 }, { "auxiliary_loss_clip": 0.01205055, "auxiliary_loss_mlp": 0.01041244, "balance_loss_clip": 1.06155729, "balance_loss_mlp": 1.03005016, "epoch": 0.1542716286899537, "flos": 11429325296640.0, "grad_norm": 2.1202418620988226, "language_loss": 0.80228436, "learning_rate": 3.840295364514371e-06, "loss": 0.82474732, "num_input_tokens_seen": 27156760, "step": 1283, "time_per_iteration": 2.568103790283203 }, { "auxiliary_loss_clip": 0.01201451, "auxiliary_loss_mlp": 0.0104223, "balance_loss_clip": 1.06387055, "balance_loss_mlp": 1.0320859, "epoch": 0.1543918715805928, "flos": 17420338719360.0, "grad_norm": 2.3358260929144676, "language_loss": 0.79110163, "learning_rate": 3.83999020183681e-06, "loss": 0.81353843, "num_input_tokens_seen": 27175455, "step": 1284, "time_per_iteration": 2.5243980884552 }, { "auxiliary_loss_clip": 0.01148837, "auxiliary_loss_mlp": 0.01049997, "balance_loss_clip": 1.05503869, "balance_loss_mlp": 1.04004955, "epoch": 0.1545121144712319, "flos": 17786376264960.0, "grad_norm": 2.1372786039057843, "language_loss": 0.78644562, "learning_rate": 3.839684760034199e-06, "loss": 0.80843389, "num_input_tokens_seen": 27193660, "step": 1285, "time_per_iteration": 2.636122465133667 }, { "auxiliary_loss_clip": 0.011831, "auxiliary_loss_mlp": 0.01043773, "balance_loss_clip": 1.06483912, "balance_loss_mlp": 1.03377759, "epoch": 0.15463235736187098, "flos": 28220185146240.0, "grad_norm": 8.537811575941502, "language_loss": 0.65304923, "learning_rate": 3.8393790391528716e-06, "loss": 0.675318, "num_input_tokens_seen": 27214355, "step": 1286, "time_per_iteration": 2.6337616443634033 }, { "auxiliary_loss_clip": 0.01197015, "auxiliary_loss_mlp": 0.01038709, "balance_loss_clip": 1.06249475, "balance_loss_mlp": 1.0291667, "epoch": 0.15475260025251006, "flos": 22856890826880.0, "grad_norm": 2.1360249894860464, "language_loss": 0.8890475, "learning_rate": 3.8390730392392075e-06, "loss": 0.91140473, "num_input_tokens_seen": 27234335, "step": 1287, "time_per_iteration": 2.572516441345215 }, { "auxiliary_loss_clip": 0.01234842, "auxiliary_loss_mlp": 0.01043578, "balance_loss_clip": 1.06996894, "balance_loss_mlp": 1.03408897, "epoch": 0.15487284314314917, "flos": 17602872658560.0, "grad_norm": 2.329428949530667, "language_loss": 0.79137409, "learning_rate": 3.838766760339626e-06, "loss": 0.81415832, "num_input_tokens_seen": 27252860, "step": 1288, "time_per_iteration": 2.4679784774780273 }, { "auxiliary_loss_clip": 0.01163041, "auxiliary_loss_mlp": 0.01040395, "balance_loss_clip": 1.05738187, "balance_loss_mlp": 1.03034544, "epoch": 0.15499308603378825, "flos": 20082037363200.0, "grad_norm": 3.282836154026219, "language_loss": 0.79486585, "learning_rate": 3.838460202500587e-06, "loss": 0.81690013, "num_input_tokens_seen": 27268650, "step": 1289, "time_per_iteration": 2.5768418312072754 }, { "auxiliary_loss_clip": 0.01183733, "auxiliary_loss_mlp": 0.01043847, "balance_loss_clip": 1.06495571, "balance_loss_mlp": 1.03313053, "epoch": 0.15511332892442733, "flos": 15918051271680.0, "grad_norm": 2.439548213551908, "language_loss": 0.74038744, "learning_rate": 3.838153365768599e-06, "loss": 0.76266325, "num_input_tokens_seen": 27285160, "step": 1290, "time_per_iteration": 3.3573570251464844 }, { "auxiliary_loss_clip": 0.01185412, "auxiliary_loss_mlp": 0.01045768, "balance_loss_clip": 1.06610596, "balance_loss_mlp": 1.03465819, "epoch": 0.15523357181506645, "flos": 41282475569280.0, "grad_norm": 2.3262128462678335, "language_loss": 0.75381535, "learning_rate": 3.837846250190206e-06, "loss": 0.77612722, "num_input_tokens_seen": 27308025, "step": 1291, "time_per_iteration": 2.756483316421509 }, { "auxiliary_loss_clip": 0.01165865, "auxiliary_loss_mlp": 0.00762761, "balance_loss_clip": 1.0608511, "balance_loss_mlp": 1.00013447, "epoch": 0.15535381470570553, "flos": 18478769806080.0, "grad_norm": 2.7160933663248343, "language_loss": 0.77139312, "learning_rate": 3.837538855811998e-06, "loss": 0.79067945, "num_input_tokens_seen": 27326200, "step": 1292, "time_per_iteration": 2.599231719970703 }, { "auxiliary_loss_clip": 0.01203965, "auxiliary_loss_mlp": 0.01046513, "balance_loss_clip": 1.06472039, "balance_loss_mlp": 1.03618932, "epoch": 0.1554740575963446, "flos": 13918150759680.0, "grad_norm": 2.2043650484058883, "language_loss": 0.70995462, "learning_rate": 3.837231182680606e-06, "loss": 0.73245943, "num_input_tokens_seen": 27344165, "step": 1293, "time_per_iteration": 2.536276340484619 }, { "auxiliary_loss_clip": 0.01219528, "auxiliary_loss_mlp": 0.01045294, "balance_loss_clip": 1.06690717, "balance_loss_mlp": 1.03526258, "epoch": 0.1555943004869837, "flos": 20847078161280.0, "grad_norm": 1.617315180711769, "language_loss": 0.75989842, "learning_rate": 3.836923230842706e-06, "loss": 0.78254664, "num_input_tokens_seen": 27363280, "step": 1294, "time_per_iteration": 3.245407819747925 }, { "auxiliary_loss_clip": 0.01170933, "auxiliary_loss_mlp": 0.01036943, "balance_loss_clip": 1.05531359, "balance_loss_mlp": 1.02629209, "epoch": 0.1557145433776228, "flos": 22085888371200.0, "grad_norm": 2.523413466720098, "language_loss": 0.80409789, "learning_rate": 3.836615000345011e-06, "loss": 0.82617664, "num_input_tokens_seen": 27381460, "step": 1295, "time_per_iteration": 3.3726446628570557 }, { "auxiliary_loss_clip": 0.01230676, "auxiliary_loss_mlp": 0.01038058, "balance_loss_clip": 1.06833136, "balance_loss_mlp": 1.02874756, "epoch": 0.1558347862682619, "flos": 19791987039360.0, "grad_norm": 2.1211117119072984, "language_loss": 0.78068423, "learning_rate": 3.836306491234282e-06, "loss": 0.80337161, "num_input_tokens_seen": 27399310, "step": 1296, "time_per_iteration": 2.5048720836639404 }, { "auxiliary_loss_clip": 0.01193611, "auxiliary_loss_mlp": 0.01040075, "balance_loss_clip": 1.06492746, "balance_loss_mlp": 1.03024042, "epoch": 0.15595502915890097, "flos": 17237086508160.0, "grad_norm": 2.1679034055129125, "language_loss": 0.75242245, "learning_rate": 3.835997703557317e-06, "loss": 0.77475929, "num_input_tokens_seen": 27416050, "step": 1297, "time_per_iteration": 3.3387980461120605 }, { "auxiliary_loss_clip": 0.01170059, "auxiliary_loss_mlp": 0.0103422, "balance_loss_clip": 1.05455625, "balance_loss_mlp": 1.02437353, "epoch": 0.15607527204954008, "flos": 19719519350400.0, "grad_norm": 1.6294968758918935, "language_loss": 0.80057198, "learning_rate": 3.83568863736096e-06, "loss": 0.82261479, "num_input_tokens_seen": 27434920, "step": 1298, "time_per_iteration": 2.618720293045044 }, { "auxiliary_loss_clip": 0.01184921, "auxiliary_loss_mlp": 0.01038057, "balance_loss_clip": 1.05799699, "balance_loss_mlp": 1.02791262, "epoch": 0.15619551494017916, "flos": 18515650095360.0, "grad_norm": 2.84444293275084, "language_loss": 0.89225566, "learning_rate": 3.8353792926920975e-06, "loss": 0.91448545, "num_input_tokens_seen": 27453570, "step": 1299, "time_per_iteration": 2.6002206802368164 }, { "auxiliary_loss_clip": 0.01221339, "auxiliary_loss_mlp": 0.01040993, "balance_loss_clip": 1.06689525, "balance_loss_mlp": 1.03127706, "epoch": 0.15631575783081825, "flos": 19902125116800.0, "grad_norm": 2.09304826991198, "language_loss": 0.81856376, "learning_rate": 3.835069669597655e-06, "loss": 0.84118712, "num_input_tokens_seen": 27471960, "step": 1300, "time_per_iteration": 2.5164473056793213 }, { "auxiliary_loss_clip": 0.01220311, "auxiliary_loss_mlp": 0.00763862, "balance_loss_clip": 1.0657692, "balance_loss_mlp": 1.00015187, "epoch": 0.15643600072145733, "flos": 20777663128320.0, "grad_norm": 2.20911837898005, "language_loss": 0.7987113, "learning_rate": 3.834759768124603e-06, "loss": 0.81855309, "num_input_tokens_seen": 27490835, "step": 1301, "time_per_iteration": 2.536215305328369 }, { "auxiliary_loss_clip": 0.01188505, "auxiliary_loss_mlp": 0.01053361, "balance_loss_clip": 1.06431723, "balance_loss_mlp": 1.04371071, "epoch": 0.15655624361209644, "flos": 18546389159040.0, "grad_norm": 2.692213728442544, "language_loss": 0.76142442, "learning_rate": 3.834449588319953e-06, "loss": 0.7838431, "num_input_tokens_seen": 27508870, "step": 1302, "time_per_iteration": 2.5457448959350586 }, { "auxiliary_loss_clip": 0.01214599, "auxiliary_loss_mlp": 0.01051203, "balance_loss_clip": 1.07073689, "balance_loss_mlp": 1.04177356, "epoch": 0.15667648650273552, "flos": 25229544727680.0, "grad_norm": 1.983897442373497, "language_loss": 0.85384738, "learning_rate": 3.834139130230758e-06, "loss": 0.87650543, "num_input_tokens_seen": 27528175, "step": 1303, "time_per_iteration": 2.560774803161621 }, { "auxiliary_loss_clip": 0.01205264, "auxiliary_loss_mlp": 0.01035725, "balance_loss_clip": 1.06388581, "balance_loss_mlp": 1.02605665, "epoch": 0.1567967293933746, "flos": 24827093769600.0, "grad_norm": 1.9151240778559682, "language_loss": 0.81250143, "learning_rate": 3.833828393904117e-06, "loss": 0.83491135, "num_input_tokens_seen": 27548455, "step": 1304, "time_per_iteration": 2.574538469314575 }, { "auxiliary_loss_clip": 0.0116246, "auxiliary_loss_mlp": 0.01041192, "balance_loss_clip": 1.05579853, "balance_loss_mlp": 1.03192961, "epoch": 0.15691697228401372, "flos": 19164555244800.0, "grad_norm": 2.1007318563073385, "language_loss": 0.77383971, "learning_rate": 3.833517379387165e-06, "loss": 0.79587632, "num_input_tokens_seen": 27564910, "step": 1305, "time_per_iteration": 2.580143690109253 }, { "auxiliary_loss_clip": 0.01218174, "auxiliary_loss_mlp": 0.01041249, "balance_loss_clip": 1.06583679, "balance_loss_mlp": 1.03109288, "epoch": 0.1570372151746528, "flos": 24790931752320.0, "grad_norm": 2.1527981446876177, "language_loss": 0.88432157, "learning_rate": 3.833206086727085e-06, "loss": 0.90691584, "num_input_tokens_seen": 27584260, "step": 1306, "time_per_iteration": 2.5511558055877686 }, { "auxiliary_loss_clip": 0.01189806, "auxiliary_loss_mlp": 0.01039048, "balance_loss_clip": 1.05975461, "balance_loss_mlp": 1.02781224, "epoch": 0.15715745806529188, "flos": 24863650836480.0, "grad_norm": 2.470396706872111, "language_loss": 0.70315301, "learning_rate": 3.8328945159710994e-06, "loss": 0.72544152, "num_input_tokens_seen": 27604440, "step": 1307, "time_per_iteration": 2.6292171478271484 }, { "auxiliary_loss_clip": 0.01222725, "auxiliary_loss_mlp": 0.00762865, "balance_loss_clip": 1.0690763, "balance_loss_mlp": 1.00009453, "epoch": 0.157277700955931, "flos": 21872148491520.0, "grad_norm": 2.069449191840747, "language_loss": 0.88701963, "learning_rate": 3.832582667166473e-06, "loss": 0.90687549, "num_input_tokens_seen": 27624250, "step": 1308, "time_per_iteration": 2.5276970863342285 }, { "auxiliary_loss_clip": 0.01193974, "auxiliary_loss_mlp": 0.01042706, "balance_loss_clip": 1.05924845, "balance_loss_mlp": 1.03233516, "epoch": 0.15739794384657008, "flos": 24533344344960.0, "grad_norm": 1.726045381206105, "language_loss": 0.81442517, "learning_rate": 3.8322705403605125e-06, "loss": 0.83679199, "num_input_tokens_seen": 27644595, "step": 1309, "time_per_iteration": 2.5789806842803955 }, { "auxiliary_loss_clip": 0.01193887, "auxiliary_loss_mlp": 0.01032264, "balance_loss_clip": 1.06300426, "balance_loss_mlp": 1.02313268, "epoch": 0.15751818673720916, "flos": 17745329998080.0, "grad_norm": 2.110106844901691, "language_loss": 0.81422758, "learning_rate": 3.831958135600568e-06, "loss": 0.83648908, "num_input_tokens_seen": 27662145, "step": 1310, "time_per_iteration": 2.5102357864379883 }, { "auxiliary_loss_clip": 0.01219121, "auxiliary_loss_mlp": 0.01037676, "balance_loss_clip": 1.06795502, "balance_loss_mlp": 1.02794814, "epoch": 0.15763842962784824, "flos": 17858520731520.0, "grad_norm": 2.1907115045527674, "language_loss": 0.79626924, "learning_rate": 3.831645452934032e-06, "loss": 0.81883717, "num_input_tokens_seen": 27680575, "step": 1311, "time_per_iteration": 2.5114235877990723 }, { "auxiliary_loss_clip": 0.0123353, "auxiliary_loss_mlp": 0.01044008, "balance_loss_clip": 1.07094073, "balance_loss_mlp": 1.03361285, "epoch": 0.15775867251848735, "flos": 26980908059520.0, "grad_norm": 1.930703644158957, "language_loss": 0.79624182, "learning_rate": 3.831332492408336e-06, "loss": 0.81901717, "num_input_tokens_seen": 27701985, "step": 1312, "time_per_iteration": 2.530386447906494 }, { "auxiliary_loss_clip": 0.01192508, "auxiliary_loss_mlp": 0.0103326, "balance_loss_clip": 1.05944371, "balance_loss_mlp": 1.02321696, "epoch": 0.15787891540912644, "flos": 19240398812160.0, "grad_norm": 2.090171120956722, "language_loss": 0.69312155, "learning_rate": 3.831019254070957e-06, "loss": 0.71537924, "num_input_tokens_seen": 27719770, "step": 1313, "time_per_iteration": 2.5457286834716797 }, { "auxiliary_loss_clip": 0.01177339, "auxiliary_loss_mlp": 0.01044842, "balance_loss_clip": 1.06170464, "balance_loss_mlp": 1.0343399, "epoch": 0.15799915829976552, "flos": 27271102037760.0, "grad_norm": 2.508343507941923, "language_loss": 0.94584858, "learning_rate": 3.8307057379694135e-06, "loss": 0.96807045, "num_input_tokens_seen": 27739105, "step": 1314, "time_per_iteration": 2.6478512287139893 }, { "auxiliary_loss_clip": 0.01230092, "auxiliary_loss_mlp": 0.01035423, "balance_loss_clip": 1.06705952, "balance_loss_mlp": 1.02571368, "epoch": 0.15811940119040463, "flos": 20405520270720.0, "grad_norm": 2.237001927363376, "language_loss": 0.82239091, "learning_rate": 3.830391944151264e-06, "loss": 0.84504604, "num_input_tokens_seen": 27754985, "step": 1315, "time_per_iteration": 2.4749934673309326 }, { "auxiliary_loss_clip": 0.01202089, "auxiliary_loss_mlp": 0.01044725, "balance_loss_clip": 1.06374323, "balance_loss_mlp": 1.03508735, "epoch": 0.1582396440810437, "flos": 32599347661440.0, "grad_norm": 2.0920697648997346, "language_loss": 0.67369407, "learning_rate": 3.830077872664114e-06, "loss": 0.69616222, "num_input_tokens_seen": 27776110, "step": 1316, "time_per_iteration": 3.351430654525757 }, { "auxiliary_loss_clip": 0.01156025, "auxiliary_loss_mlp": 0.01037423, "balance_loss_clip": 1.0551765, "balance_loss_mlp": 1.0287025, "epoch": 0.1583598869716828, "flos": 33800559310080.0, "grad_norm": 2.2602731416517305, "language_loss": 0.72778875, "learning_rate": 3.829763523555604e-06, "loss": 0.7497232, "num_input_tokens_seen": 27796510, "step": 1317, "time_per_iteration": 2.746156692504883 }, { "auxiliary_loss_clip": 0.01211726, "auxiliary_loss_mlp": 0.01042146, "balance_loss_clip": 1.06829, "balance_loss_mlp": 1.03321087, "epoch": 0.15848012986232188, "flos": 24681332378880.0, "grad_norm": 2.1978226262810248, "language_loss": 0.77960503, "learning_rate": 3.829448896873423e-06, "loss": 0.80214369, "num_input_tokens_seen": 27815610, "step": 1318, "time_per_iteration": 2.543355941772461 }, { "auxiliary_loss_clip": 0.01161398, "auxiliary_loss_mlp": 0.00763042, "balance_loss_clip": 1.06197929, "balance_loss_mlp": 1.00004923, "epoch": 0.158600372752961, "flos": 22602068766720.0, "grad_norm": 1.9774672472308699, "language_loss": 0.79300916, "learning_rate": 3.829133992665299e-06, "loss": 0.81225359, "num_input_tokens_seen": 27834735, "step": 1319, "time_per_iteration": 2.6372323036193848 }, { "auxiliary_loss_clip": 0.01202681, "auxiliary_loss_mlp": 0.0103946, "balance_loss_clip": 1.06456494, "balance_loss_mlp": 1.02991772, "epoch": 0.15872061564360007, "flos": 27927944092800.0, "grad_norm": 6.130193027262617, "language_loss": 0.89184564, "learning_rate": 3.828818810979002e-06, "loss": 0.91426706, "num_input_tokens_seen": 27853065, "step": 1320, "time_per_iteration": 3.33537220954895 }, { "auxiliary_loss_clip": 0.01233636, "auxiliary_loss_mlp": 0.01042864, "balance_loss_clip": 1.07314038, "balance_loss_mlp": 1.03296947, "epoch": 0.15884085853423915, "flos": 23696805525120.0, "grad_norm": 1.9779696948097243, "language_loss": 0.80714428, "learning_rate": 3.8285033518623454e-06, "loss": 0.82990932, "num_input_tokens_seen": 27873315, "step": 1321, "time_per_iteration": 2.497628688812256 }, { "auxiliary_loss_clip": 0.01222992, "auxiliary_loss_mlp": 0.01039526, "balance_loss_clip": 1.06948614, "balance_loss_mlp": 1.02808166, "epoch": 0.15896110142487826, "flos": 23112359331840.0, "grad_norm": 5.656659443725247, "language_loss": 0.81327581, "learning_rate": 3.8281876153631845e-06, "loss": 0.83590102, "num_input_tokens_seen": 27890070, "step": 1322, "time_per_iteration": 3.2457497119903564 }, { "auxiliary_loss_clip": 0.01169409, "auxiliary_loss_mlp": 0.01037929, "balance_loss_clip": 1.05998826, "balance_loss_mlp": 1.02790999, "epoch": 0.15908134431551735, "flos": 14685238632960.0, "grad_norm": 2.008971357633955, "language_loss": 0.64669454, "learning_rate": 3.827871601529416e-06, "loss": 0.66876793, "num_input_tokens_seen": 27908590, "step": 1323, "time_per_iteration": 3.416466236114502 }, { "auxiliary_loss_clip": 0.01180867, "auxiliary_loss_mlp": 0.010359, "balance_loss_clip": 1.06068659, "balance_loss_mlp": 1.02632165, "epoch": 0.15920158720615643, "flos": 20193611984640.0, "grad_norm": 2.051079970606802, "language_loss": 0.80634069, "learning_rate": 3.827555310408979e-06, "loss": 0.82850838, "num_input_tokens_seen": 27927985, "step": 1324, "time_per_iteration": 2.588622808456421 }, { "auxiliary_loss_clip": 0.01181442, "auxiliary_loss_mlp": 0.01047314, "balance_loss_clip": 1.06330812, "balance_loss_mlp": 1.03767598, "epoch": 0.1593218300967955, "flos": 24826626892800.0, "grad_norm": 1.7489067922827104, "language_loss": 0.82704294, "learning_rate": 3.827238742049854e-06, "loss": 0.84933054, "num_input_tokens_seen": 27948280, "step": 1325, "time_per_iteration": 2.6101632118225098 }, { "auxiliary_loss_clip": 0.01229951, "auxiliary_loss_mlp": 0.01034994, "balance_loss_clip": 1.06750011, "balance_loss_mlp": 1.02430129, "epoch": 0.15944207298743462, "flos": 28328707111680.0, "grad_norm": 1.9482494700290918, "language_loss": 0.51733297, "learning_rate": 3.826921896500066e-06, "loss": 0.53998244, "num_input_tokens_seen": 27969565, "step": 1326, "time_per_iteration": 2.550607442855835 }, { "auxiliary_loss_clip": 0.01194224, "auxiliary_loss_mlp": 0.01043919, "balance_loss_clip": 1.06681967, "balance_loss_mlp": 1.03434622, "epoch": 0.1595623158780737, "flos": 22964838174720.0, "grad_norm": 1.7985937082948331, "language_loss": 0.77981317, "learning_rate": 3.826604773807678e-06, "loss": 0.8021946, "num_input_tokens_seen": 27987540, "step": 1327, "time_per_iteration": 2.6059305667877197 }, { "auxiliary_loss_clip": 0.01197115, "auxiliary_loss_mlp": 0.01037668, "balance_loss_clip": 1.0593586, "balance_loss_mlp": 1.02573526, "epoch": 0.1596825587687128, "flos": 19710540950400.0, "grad_norm": 2.8525714598516805, "language_loss": 0.73728055, "learning_rate": 3.826287374020798e-06, "loss": 0.7596283, "num_input_tokens_seen": 28002345, "step": 1328, "time_per_iteration": 2.522134780883789 }, { "auxiliary_loss_clip": 0.01236157, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.0751996, "balance_loss_mlp": 1.02970254, "epoch": 0.1598028016593519, "flos": 22637727993600.0, "grad_norm": 2.008823828749455, "language_loss": 0.81773835, "learning_rate": 3.825969697187575e-06, "loss": 0.84049261, "num_input_tokens_seen": 28021675, "step": 1329, "time_per_iteration": 2.5200088024139404 }, { "auxiliary_loss_clip": 0.01182141, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.06124008, "balance_loss_mlp": 1.02618074, "epoch": 0.15992304454999098, "flos": 20482908122880.0, "grad_norm": 2.063328543063949, "language_loss": 0.69497037, "learning_rate": 3.8256517433562015e-06, "loss": 0.71714836, "num_input_tokens_seen": 28039615, "step": 1330, "time_per_iteration": 2.564527988433838 }, { "auxiliary_loss_clip": 0.01233773, "auxiliary_loss_mlp": 0.01037588, "balance_loss_clip": 1.07253051, "balance_loss_mlp": 1.0281111, "epoch": 0.16004328744063007, "flos": 17676094533120.0, "grad_norm": 2.393507431881445, "language_loss": 0.91502708, "learning_rate": 3.82533351257491e-06, "loss": 0.93774068, "num_input_tokens_seen": 28057565, "step": 1331, "time_per_iteration": 2.486743688583374 }, { "auxiliary_loss_clip": 0.0121362, "auxiliary_loss_mlp": 0.01040849, "balance_loss_clip": 1.06803739, "balance_loss_mlp": 1.03137827, "epoch": 0.16016353033126918, "flos": 24098717779200.0, "grad_norm": 2.2955308912138874, "language_loss": 0.88612533, "learning_rate": 3.825015004891975e-06, "loss": 0.90867007, "num_input_tokens_seen": 28076305, "step": 1332, "time_per_iteration": 2.535738706588745 }, { "auxiliary_loss_clip": 0.01212612, "auxiliary_loss_mlp": 0.01040292, "balance_loss_clip": 1.06620932, "balance_loss_mlp": 1.03020728, "epoch": 0.16028377322190826, "flos": 27634841112960.0, "grad_norm": 2.0319955450616844, "language_loss": 0.75792092, "learning_rate": 3.824696220355716e-06, "loss": 0.78044999, "num_input_tokens_seen": 28097895, "step": 1333, "time_per_iteration": 2.604069232940674 }, { "auxiliary_loss_clip": 0.011967, "auxiliary_loss_mlp": 0.01038253, "balance_loss_clip": 1.06500816, "balance_loss_mlp": 1.02729166, "epoch": 0.16040401611254734, "flos": 20961202648320.0, "grad_norm": 1.7281520048897914, "language_loss": 0.78808647, "learning_rate": 3.824377159014491e-06, "loss": 0.81043601, "num_input_tokens_seen": 28118790, "step": 1334, "time_per_iteration": 2.568821907043457 }, { "auxiliary_loss_clip": 0.0121396, "auxiliary_loss_mlp": 0.01036874, "balance_loss_clip": 1.06897533, "balance_loss_mlp": 1.02718782, "epoch": 0.16052425900318643, "flos": 21247051080960.0, "grad_norm": 2.6470223979668557, "language_loss": 0.8485254, "learning_rate": 3.824057820916702e-06, "loss": 0.87103373, "num_input_tokens_seen": 28135995, "step": 1335, "time_per_iteration": 2.530348300933838 }, { "auxiliary_loss_clip": 0.012055, "auxiliary_loss_mlp": 0.01038584, "balance_loss_clip": 1.06710005, "balance_loss_mlp": 1.02741456, "epoch": 0.16064450189382554, "flos": 15524004096000.0, "grad_norm": 2.48239093332264, "language_loss": 0.71339142, "learning_rate": 3.8237382061107904e-06, "loss": 0.73583233, "num_input_tokens_seen": 28152715, "step": 1336, "time_per_iteration": 2.512709856033325 }, { "auxiliary_loss_clip": 0.01134976, "auxiliary_loss_mlp": 0.01042467, "balance_loss_clip": 1.05209231, "balance_loss_mlp": 1.03269768, "epoch": 0.16076474478446462, "flos": 21178497974400.0, "grad_norm": 1.9174083408361586, "language_loss": 0.78846514, "learning_rate": 3.823418314645243e-06, "loss": 0.81023955, "num_input_tokens_seen": 28171590, "step": 1337, "time_per_iteration": 2.6816155910491943 }, { "auxiliary_loss_clip": 0.01152735, "auxiliary_loss_mlp": 0.01042145, "balance_loss_clip": 1.05838752, "balance_loss_mlp": 1.03182161, "epoch": 0.1608849876751037, "flos": 18366476912640.0, "grad_norm": 1.860664242697388, "language_loss": 0.75271475, "learning_rate": 3.823098146568588e-06, "loss": 0.77466357, "num_input_tokens_seen": 28191295, "step": 1338, "time_per_iteration": 2.610321521759033 }, { "auxiliary_loss_clip": 0.01219367, "auxiliary_loss_mlp": 0.01033486, "balance_loss_clip": 1.07055354, "balance_loss_mlp": 1.02396095, "epoch": 0.1610052305657428, "flos": 29497024880640.0, "grad_norm": 1.9681194401659374, "language_loss": 0.71415246, "learning_rate": 3.822777701929394e-06, "loss": 0.73668098, "num_input_tokens_seen": 28213120, "step": 1339, "time_per_iteration": 2.605142116546631 }, { "auxiliary_loss_clip": 0.01203818, "auxiliary_loss_mlp": 0.01038975, "balance_loss_clip": 1.06418276, "balance_loss_mlp": 1.02933645, "epoch": 0.1611254734563819, "flos": 26797871329920.0, "grad_norm": 2.0257914292597734, "language_loss": 0.73490328, "learning_rate": 3.8224569807762714e-06, "loss": 0.75733113, "num_input_tokens_seen": 28232440, "step": 1340, "time_per_iteration": 2.5629687309265137 }, { "auxiliary_loss_clip": 0.01155353, "auxiliary_loss_mlp": 0.01041541, "balance_loss_clip": 1.05754197, "balance_loss_mlp": 1.0309906, "epoch": 0.16124571634702098, "flos": 22419570741120.0, "grad_norm": 1.841128307339568, "language_loss": 0.76894319, "learning_rate": 3.822135983157873e-06, "loss": 0.79091215, "num_input_tokens_seen": 28251715, "step": 1341, "time_per_iteration": 2.631728410720825 }, { "auxiliary_loss_clip": 0.01228669, "auxiliary_loss_mlp": 0.00763173, "balance_loss_clip": 1.06843472, "balance_loss_mlp": 0.99998474, "epoch": 0.16136595923766006, "flos": 10999116103680.0, "grad_norm": 2.8256766366828723, "language_loss": 0.84146512, "learning_rate": 3.821814709122896e-06, "loss": 0.8613835, "num_input_tokens_seen": 28269765, "step": 1342, "time_per_iteration": 3.1904115676879883 }, { "auxiliary_loss_clip": 0.01202967, "auxiliary_loss_mlp": 0.01037027, "balance_loss_clip": 1.06746721, "balance_loss_mlp": 1.02846754, "epoch": 0.16148620212829917, "flos": 21214983214080.0, "grad_norm": 2.5364497980133702, "language_loss": 0.84602767, "learning_rate": 3.821493158720076e-06, "loss": 0.86842763, "num_input_tokens_seen": 28288870, "step": 1343, "time_per_iteration": 2.558246612548828 }, { "auxiliary_loss_clip": 0.01187181, "auxiliary_loss_mlp": 0.01041394, "balance_loss_clip": 1.06021988, "balance_loss_mlp": 1.03086209, "epoch": 0.16160644501893826, "flos": 16758468760320.0, "grad_norm": 3.6188432377477477, "language_loss": 0.73220468, "learning_rate": 3.821171331998191e-06, "loss": 0.75449049, "num_input_tokens_seen": 28305400, "step": 1344, "time_per_iteration": 2.5439980030059814 }, { "auxiliary_loss_clip": 0.01104498, "auxiliary_loss_mlp": 0.01006691, "balance_loss_clip": 1.04332137, "balance_loss_mlp": 1.00381851, "epoch": 0.16172668790957734, "flos": 64444967308800.0, "grad_norm": 0.7285667685570335, "language_loss": 0.54465449, "learning_rate": 3.820849229006064e-06, "loss": 0.56576633, "num_input_tokens_seen": 28373150, "step": 1345, "time_per_iteration": 3.3187265396118164 }, { "auxiliary_loss_clip": 0.0123205, "auxiliary_loss_mlp": 0.01035545, "balance_loss_clip": 1.06879032, "balance_loss_mlp": 1.02598405, "epoch": 0.16184693080021645, "flos": 23257689759360.0, "grad_norm": 1.950969683626627, "language_loss": 0.70634401, "learning_rate": 3.8205268497925564e-06, "loss": 0.72902, "num_input_tokens_seen": 28393620, "step": 1346, "time_per_iteration": 3.3106627464294434 }, { "auxiliary_loss_clip": 0.01232794, "auxiliary_loss_mlp": 0.01037959, "balance_loss_clip": 1.07206297, "balance_loss_mlp": 1.0278796, "epoch": 0.16196717369085553, "flos": 17451113696640.0, "grad_norm": 2.7349630469156225, "language_loss": 0.78718638, "learning_rate": 3.8202041944065725e-06, "loss": 0.80989391, "num_input_tokens_seen": 28409440, "step": 1347, "time_per_iteration": 2.450683116912842 }, { "auxiliary_loss_clip": 0.01238077, "auxiliary_loss_mlp": 0.01046928, "balance_loss_clip": 1.07602477, "balance_loss_mlp": 1.0375998, "epoch": 0.16208741658149461, "flos": 23873377806720.0, "grad_norm": 2.3459986007039837, "language_loss": 0.74048555, "learning_rate": 3.819881262897061e-06, "loss": 0.76333559, "num_input_tokens_seen": 28427575, "step": 1348, "time_per_iteration": 4.026902437210083 }, { "auxiliary_loss_clip": 0.01191363, "auxiliary_loss_mlp": 0.01044135, "balance_loss_clip": 1.06866765, "balance_loss_mlp": 1.03296554, "epoch": 0.1622076594721337, "flos": 25884806584320.0, "grad_norm": 1.8713317544338712, "language_loss": 0.73385125, "learning_rate": 3.819558055313008e-06, "loss": 0.75620627, "num_input_tokens_seen": 28448260, "step": 1349, "time_per_iteration": 2.6188266277313232 }, { "auxiliary_loss_clip": 0.0122441, "auxiliary_loss_mlp": 0.01041002, "balance_loss_clip": 1.06971025, "balance_loss_mlp": 1.0311079, "epoch": 0.1623279023627728, "flos": 21539759011200.0, "grad_norm": 2.2955405247983918, "language_loss": 0.77274656, "learning_rate": 3.819234571703444e-06, "loss": 0.79540068, "num_input_tokens_seen": 28467085, "step": 1350, "time_per_iteration": 2.5220553874969482 }, { "auxiliary_loss_clip": 0.01213524, "auxiliary_loss_mlp": 0.01045403, "balance_loss_clip": 1.06794071, "balance_loss_mlp": 1.0351156, "epoch": 0.1624481452534119, "flos": 22085421494400.0, "grad_norm": 1.9760769026162388, "language_loss": 0.85449624, "learning_rate": 3.8189108121174435e-06, "loss": 0.87708557, "num_input_tokens_seen": 28486850, "step": 1351, "time_per_iteration": 2.5534543991088867 }, { "auxiliary_loss_clip": 0.01180584, "auxiliary_loss_mlp": 0.01038018, "balance_loss_clip": 1.06514311, "balance_loss_mlp": 1.02801609, "epoch": 0.16256838814405097, "flos": 27087490690560.0, "grad_norm": 1.6475827997468981, "language_loss": 0.83675516, "learning_rate": 3.818586776604118e-06, "loss": 0.8589412, "num_input_tokens_seen": 28507490, "step": 1352, "time_per_iteration": 2.6812973022460938 }, { "auxiliary_loss_clip": 0.01196077, "auxiliary_loss_mlp": 0.01036354, "balance_loss_clip": 1.06434679, "balance_loss_mlp": 1.02703738, "epoch": 0.16268863103469008, "flos": 20120354196480.0, "grad_norm": 1.9199080452929496, "language_loss": 0.61486042, "learning_rate": 3.818262465212625e-06, "loss": 0.63718474, "num_input_tokens_seen": 28527615, "step": 1353, "time_per_iteration": 2.5714099407196045 }, { "auxiliary_loss_clip": 0.01205992, "auxiliary_loss_mlp": 0.01040656, "balance_loss_clip": 1.06646371, "balance_loss_mlp": 1.03024364, "epoch": 0.16280887392532917, "flos": 18332792933760.0, "grad_norm": 2.0167963035804672, "language_loss": 0.77255577, "learning_rate": 3.817937877992161e-06, "loss": 0.79502225, "num_input_tokens_seen": 28544910, "step": 1354, "time_per_iteration": 2.5057339668273926 }, { "auxiliary_loss_clip": 0.01186515, "auxiliary_loss_mlp": 0.00764316, "balance_loss_clip": 1.06198573, "balance_loss_mlp": 0.9999662, "epoch": 0.16292911681596825, "flos": 11874330892800.0, "grad_norm": 2.639821713956828, "language_loss": 0.85383457, "learning_rate": 3.817613014991967e-06, "loss": 0.87334287, "num_input_tokens_seen": 28561050, "step": 1355, "time_per_iteration": 2.5903990268707275 }, { "auxiliary_loss_clip": 0.01168818, "auxiliary_loss_mlp": 0.01036005, "balance_loss_clip": 1.05791318, "balance_loss_mlp": 1.02535939, "epoch": 0.16304935970660733, "flos": 26103466627200.0, "grad_norm": 1.9233166147113374, "language_loss": 0.76836491, "learning_rate": 3.817287876261323e-06, "loss": 0.79041314, "num_input_tokens_seen": 28581385, "step": 1356, "time_per_iteration": 2.6245508193969727 }, { "auxiliary_loss_clip": 0.01195315, "auxiliary_loss_mlp": 0.01036378, "balance_loss_clip": 1.06434369, "balance_loss_mlp": 1.02601254, "epoch": 0.16316960259724644, "flos": 29351945848320.0, "grad_norm": 1.796914650461569, "language_loss": 0.79921913, "learning_rate": 3.816962461849553e-06, "loss": 0.82153606, "num_input_tokens_seen": 28603255, "step": 1357, "time_per_iteration": 2.6151528358459473 }, { "auxiliary_loss_clip": 0.01188264, "auxiliary_loss_mlp": 0.01042251, "balance_loss_clip": 1.06190407, "balance_loss_mlp": 1.03234434, "epoch": 0.16328984548788553, "flos": 20886759711360.0, "grad_norm": 3.0583888120868163, "language_loss": 0.84372187, "learning_rate": 3.8166367718060235e-06, "loss": 0.866027, "num_input_tokens_seen": 28623145, "step": 1358, "time_per_iteration": 2.5495662689208984 }, { "auxiliary_loss_clip": 0.01213204, "auxiliary_loss_mlp": 0.01040896, "balance_loss_clip": 1.0660491, "balance_loss_mlp": 1.03107333, "epoch": 0.1634100883785246, "flos": 18041090584320.0, "grad_norm": 2.8791618512914714, "language_loss": 0.76428819, "learning_rate": 3.816310806180139e-06, "loss": 0.78682911, "num_input_tokens_seen": 28641555, "step": 1359, "time_per_iteration": 2.523298501968384 }, { "auxiliary_loss_clip": 0.01200883, "auxiliary_loss_mlp": 0.01041206, "balance_loss_clip": 1.06839991, "balance_loss_mlp": 1.03045356, "epoch": 0.16353033126916372, "flos": 24572128055040.0, "grad_norm": 1.6719689440458074, "language_loss": 0.81004012, "learning_rate": 3.81598456502135e-06, "loss": 0.832461, "num_input_tokens_seen": 28661575, "step": 1360, "time_per_iteration": 2.569723606109619 }, { "auxiliary_loss_clip": 0.01192728, "auxiliary_loss_mlp": 0.01045268, "balance_loss_clip": 1.06389415, "balance_loss_mlp": 1.03531373, "epoch": 0.1636505741598028, "flos": 19892895321600.0, "grad_norm": 1.9012235202621883, "language_loss": 0.87010586, "learning_rate": 3.8156580483791455e-06, "loss": 0.8924858, "num_input_tokens_seen": 28676765, "step": 1361, "time_per_iteration": 2.5237433910369873 }, { "auxiliary_loss_clip": 0.01232822, "auxiliary_loss_mlp": 0.01033333, "balance_loss_clip": 1.0712769, "balance_loss_mlp": 1.02312231, "epoch": 0.16377081705044189, "flos": 28402611344640.0, "grad_norm": 2.111529834887323, "language_loss": 0.76671976, "learning_rate": 3.815331256303059e-06, "loss": 0.78938133, "num_input_tokens_seen": 28696795, "step": 1362, "time_per_iteration": 2.534902811050415 }, { "auxiliary_loss_clip": 0.01182751, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.06605947, "balance_loss_mlp": 1.02364719, "epoch": 0.163891059941081, "flos": 21908059113600.0, "grad_norm": 2.3042445604089665, "language_loss": 0.77210957, "learning_rate": 3.815004188842665e-06, "loss": 0.79428267, "num_input_tokens_seen": 28714835, "step": 1363, "time_per_iteration": 2.5863635540008545 }, { "auxiliary_loss_clip": 0.01188166, "auxiliary_loss_mlp": 0.01033403, "balance_loss_clip": 1.05729079, "balance_loss_mlp": 1.02362168, "epoch": 0.16401130283172008, "flos": 26797619934720.0, "grad_norm": 1.6598259044386634, "language_loss": 0.79773265, "learning_rate": 3.814676846047578e-06, "loss": 0.81994832, "num_input_tokens_seen": 28735710, "step": 1364, "time_per_iteration": 2.5926711559295654 }, { "auxiliary_loss_clip": 0.01213702, "auxiliary_loss_mlp": 0.01046916, "balance_loss_clip": 1.06744361, "balance_loss_mlp": 1.03601992, "epoch": 0.16413154572235916, "flos": 32997417160320.0, "grad_norm": 2.046966444360988, "language_loss": 0.69829565, "learning_rate": 3.8143492279674565e-06, "loss": 0.72090179, "num_input_tokens_seen": 28758405, "step": 1365, "time_per_iteration": 2.6342074871063232 }, { "auxiliary_loss_clip": 0.01089315, "auxiliary_loss_mlp": 0.01004929, "balance_loss_clip": 1.03356707, "balance_loss_mlp": 1.00216293, "epoch": 0.16425178861299825, "flos": 40113622074240.0, "grad_norm": 0.8360547672994669, "language_loss": 0.58430004, "learning_rate": 3.8140213346519997e-06, "loss": 0.60524249, "num_input_tokens_seen": 28809000, "step": 1366, "time_per_iteration": 2.907327651977539 }, { "auxiliary_loss_clip": 0.01173537, "auxiliary_loss_mlp": 0.01043055, "balance_loss_clip": 1.06206954, "balance_loss_mlp": 1.03239179, "epoch": 0.16437203150363736, "flos": 25447486498560.0, "grad_norm": 2.355750454497719, "language_loss": 0.76628548, "learning_rate": 3.813693166150948e-06, "loss": 0.78845143, "num_input_tokens_seen": 28829210, "step": 1367, "time_per_iteration": 2.6163647174835205 }, { "auxiliary_loss_clip": 0.011786, "auxiliary_loss_mlp": 0.01041505, "balance_loss_clip": 1.06383574, "balance_loss_mlp": 1.03030539, "epoch": 0.16449227439427644, "flos": 23476888506240.0, "grad_norm": 2.1338283625477037, "language_loss": 0.8558892, "learning_rate": 3.813364722514086e-06, "loss": 0.87809026, "num_input_tokens_seen": 28847545, "step": 1368, "time_per_iteration": 3.450256586074829 }, { "auxiliary_loss_clip": 0.01211754, "auxiliary_loss_mlp": 0.01028913, "balance_loss_clip": 1.06483424, "balance_loss_mlp": 1.01918602, "epoch": 0.16461251728491552, "flos": 13545217802880.0, "grad_norm": 1.9898887250239283, "language_loss": 0.80607873, "learning_rate": 3.8130360037912368e-06, "loss": 0.82848543, "num_input_tokens_seen": 28863990, "step": 1369, "time_per_iteration": 2.5063517093658447 }, { "auxiliary_loss_clip": 0.01210319, "auxiliary_loss_mlp": 0.010368, "balance_loss_clip": 1.0638181, "balance_loss_mlp": 1.02675653, "epoch": 0.16473276017555463, "flos": 23003298662400.0, "grad_norm": 2.3364352719887385, "language_loss": 0.82093334, "learning_rate": 3.812707010032268e-06, "loss": 0.84340453, "num_input_tokens_seen": 28883045, "step": 1370, "time_per_iteration": 2.5791122913360596 }, { "auxiliary_loss_clip": 0.0122277, "auxiliary_loss_mlp": 0.0103857, "balance_loss_clip": 1.07224357, "balance_loss_mlp": 1.02848446, "epoch": 0.16485300306619372, "flos": 24790680357120.0, "grad_norm": 2.163001124671667, "language_loss": 0.78942114, "learning_rate": 3.8123777412870863e-06, "loss": 0.81203449, "num_input_tokens_seen": 28902545, "step": 1371, "time_per_iteration": 2.5799107551574707 }, { "auxiliary_loss_clip": 0.01201885, "auxiliary_loss_mlp": 0.01044851, "balance_loss_clip": 1.06481838, "balance_loss_mlp": 1.0345875, "epoch": 0.1649732459568328, "flos": 21106497162240.0, "grad_norm": 2.2231679546362955, "language_loss": 0.78586143, "learning_rate": 3.812048197605643e-06, "loss": 0.80832887, "num_input_tokens_seen": 28921440, "step": 1372, "time_per_iteration": 3.335482358932495 }, { "auxiliary_loss_clip": 0.01212759, "auxiliary_loss_mlp": 0.01034129, "balance_loss_clip": 1.06426394, "balance_loss_mlp": 1.02401412, "epoch": 0.16509348884747188, "flos": 20266726118400.0, "grad_norm": 1.8586618209360404, "language_loss": 0.81092381, "learning_rate": 3.8117183790379277e-06, "loss": 0.83339268, "num_input_tokens_seen": 28939890, "step": 1373, "time_per_iteration": 3.218022346496582 }, { "auxiliary_loss_clip": 0.01225708, "auxiliary_loss_mlp": 0.01038043, "balance_loss_clip": 1.06643057, "balance_loss_mlp": 1.02845216, "epoch": 0.165213731738111, "flos": 11035493602560.0, "grad_norm": 3.4881536802481157, "language_loss": 0.94165462, "learning_rate": 3.811388285633976e-06, "loss": 0.96429217, "num_input_tokens_seen": 28955875, "step": 1374, "time_per_iteration": 3.2937605381011963 }, { "auxiliary_loss_clip": 0.01173615, "auxiliary_loss_mlp": 0.01048514, "balance_loss_clip": 1.06234741, "balance_loss_mlp": 1.03769541, "epoch": 0.16533397462875007, "flos": 29972051268480.0, "grad_norm": 2.1623251655045603, "language_loss": 0.61982131, "learning_rate": 3.811057917443861e-06, "loss": 0.64204264, "num_input_tokens_seen": 28975140, "step": 1375, "time_per_iteration": 2.6993937492370605 }, { "auxiliary_loss_clip": 0.01106584, "auxiliary_loss_mlp": 0.01003996, "balance_loss_clip": 1.03566253, "balance_loss_mlp": 1.00109899, "epoch": 0.16545421751938916, "flos": 65556763027200.0, "grad_norm": 0.8805192363961902, "language_loss": 0.68247473, "learning_rate": 3.8107272745177e-06, "loss": 0.70358062, "num_input_tokens_seen": 29047470, "step": 1376, "time_per_iteration": 3.2831995487213135 }, { "auxiliary_loss_clip": 0.01183185, "auxiliary_loss_mlp": 0.01036473, "balance_loss_clip": 1.06116033, "balance_loss_mlp": 1.02668571, "epoch": 0.16557446041002827, "flos": 22492361652480.0, "grad_norm": 1.723534281310649, "language_loss": 0.78775525, "learning_rate": 3.8103963569056513e-06, "loss": 0.80995184, "num_input_tokens_seen": 29066605, "step": 1377, "time_per_iteration": 2.821563959121704 }, { "auxiliary_loss_clip": 0.01192875, "auxiliary_loss_mlp": 0.01037328, "balance_loss_clip": 1.05993772, "balance_loss_mlp": 1.02775562, "epoch": 0.16569470330066735, "flos": 24602723464320.0, "grad_norm": 1.9252926707613467, "language_loss": 0.87956989, "learning_rate": 3.8100651646579146e-06, "loss": 0.90187192, "num_input_tokens_seen": 29085815, "step": 1378, "time_per_iteration": 2.5722429752349854 }, { "auxiliary_loss_clip": 0.0118977, "auxiliary_loss_mlp": 0.01042868, "balance_loss_clip": 1.05921626, "balance_loss_mlp": 1.03294992, "epoch": 0.16581494619130643, "flos": 15006207588480.0, "grad_norm": 2.7233051793348424, "language_loss": 0.92862546, "learning_rate": 3.8097336978247317e-06, "loss": 0.95095187, "num_input_tokens_seen": 29102520, "step": 1379, "time_per_iteration": 2.5165750980377197 }, { "auxiliary_loss_clip": 0.01180917, "auxiliary_loss_mlp": 0.01044734, "balance_loss_clip": 1.05985045, "balance_loss_mlp": 1.03387976, "epoch": 0.16593518908194552, "flos": 17420338719360.0, "grad_norm": 1.9878376797238948, "language_loss": 0.88830066, "learning_rate": 3.8094019564563854e-06, "loss": 0.91055715, "num_input_tokens_seen": 29119450, "step": 1380, "time_per_iteration": 2.518465280532837 }, { "auxiliary_loss_clip": 0.01230153, "auxiliary_loss_mlp": 0.00763229, "balance_loss_clip": 1.0697999, "balance_loss_mlp": 0.99996352, "epoch": 0.16605543197258463, "flos": 20412631163520.0, "grad_norm": 2.392512846522975, "language_loss": 0.75050139, "learning_rate": 3.809069940603201e-06, "loss": 0.77043521, "num_input_tokens_seen": 29137405, "step": 1381, "time_per_iteration": 2.489375352859497 }, { "auxiliary_loss_clip": 0.01186049, "auxiliary_loss_mlp": 0.01039166, "balance_loss_clip": 1.06254506, "balance_loss_mlp": 1.02903914, "epoch": 0.1661756748632237, "flos": 14209745368320.0, "grad_norm": 2.101805838205608, "language_loss": 0.78010279, "learning_rate": 3.8087376503155452e-06, "loss": 0.80235493, "num_input_tokens_seen": 29154890, "step": 1382, "time_per_iteration": 2.539548635482788 }, { "auxiliary_loss_clip": 0.01103777, "auxiliary_loss_mlp": 0.01004752, "balance_loss_clip": 1.03467107, "balance_loss_mlp": 1.00186729, "epoch": 0.1662959177538628, "flos": 66080877350400.0, "grad_norm": 0.911958710379343, "language_loss": 0.56211305, "learning_rate": 3.808405085643826e-06, "loss": 0.58319831, "num_input_tokens_seen": 29219770, "step": 1383, "time_per_iteration": 3.1782608032226562 }, { "auxiliary_loss_clip": 0.01227515, "auxiliary_loss_mlp": 0.00762685, "balance_loss_clip": 1.06877446, "balance_loss_mlp": 0.99994051, "epoch": 0.1664161606445019, "flos": 20740567357440.0, "grad_norm": 2.1472831143931908, "language_loss": 0.88423258, "learning_rate": 3.8080722466384925e-06, "loss": 0.90413463, "num_input_tokens_seen": 29237620, "step": 1384, "time_per_iteration": 2.494722366333008 }, { "auxiliary_loss_clip": 0.01225343, "auxiliary_loss_mlp": 0.01041487, "balance_loss_clip": 1.06400013, "balance_loss_mlp": 1.03069901, "epoch": 0.166536403535141, "flos": 25260930236160.0, "grad_norm": 2.359143671669053, "language_loss": 0.70961267, "learning_rate": 3.8077391333500376e-06, "loss": 0.73228097, "num_input_tokens_seen": 29256760, "step": 1385, "time_per_iteration": 2.5153942108154297 }, { "auxiliary_loss_clip": 0.01196558, "auxiliary_loss_mlp": 0.01038678, "balance_loss_clip": 1.06459522, "balance_loss_mlp": 1.02934992, "epoch": 0.16665664642578007, "flos": 25447450584960.0, "grad_norm": 1.8792113523093825, "language_loss": 0.76733249, "learning_rate": 3.8074057458289934e-06, "loss": 0.78968483, "num_input_tokens_seen": 29277450, "step": 1386, "time_per_iteration": 2.612356662750244 }, { "auxiliary_loss_clip": 0.01196415, "auxiliary_loss_mlp": 0.01035493, "balance_loss_clip": 1.06204987, "balance_loss_mlp": 1.02537775, "epoch": 0.16677688931641918, "flos": 22200767043840.0, "grad_norm": 2.0184832630058938, "language_loss": 0.82501101, "learning_rate": 3.807072084125934e-06, "loss": 0.84733009, "num_input_tokens_seen": 29299300, "step": 1387, "time_per_iteration": 2.5806500911712646 }, { "auxiliary_loss_clip": 0.01197956, "auxiliary_loss_mlp": 0.01038064, "balance_loss_clip": 1.06796527, "balance_loss_mlp": 1.02773404, "epoch": 0.16689713220705826, "flos": 16945958776320.0, "grad_norm": 3.2344479930191286, "language_loss": 0.80292916, "learning_rate": 3.806738148291477e-06, "loss": 0.82528937, "num_input_tokens_seen": 29316125, "step": 1388, "time_per_iteration": 2.5384366512298584 }, { "auxiliary_loss_clip": 0.0115533, "auxiliary_loss_mlp": 0.01045499, "balance_loss_clip": 1.05822897, "balance_loss_mlp": 1.03422201, "epoch": 0.16701737509769735, "flos": 36244423923840.0, "grad_norm": 2.51016694815606, "language_loss": 0.71281767, "learning_rate": 3.8064039383762793e-06, "loss": 0.73482597, "num_input_tokens_seen": 29338490, "step": 1389, "time_per_iteration": 2.7582147121429443 }, { "auxiliary_loss_clip": 0.01207961, "auxiliary_loss_mlp": 0.01043142, "balance_loss_clip": 1.06459939, "balance_loss_mlp": 1.03273535, "epoch": 0.16713761798833643, "flos": 23258659426560.0, "grad_norm": 2.2623424481934693, "language_loss": 0.76998663, "learning_rate": 3.8060694544310396e-06, "loss": 0.79249763, "num_input_tokens_seen": 29357000, "step": 1390, "time_per_iteration": 2.5419418811798096 }, { "auxiliary_loss_clip": 0.0122903, "auxiliary_loss_mlp": 0.01044588, "balance_loss_clip": 1.06801987, "balance_loss_mlp": 1.03409183, "epoch": 0.16725786087897554, "flos": 25302515207040.0, "grad_norm": 1.9733497488128657, "language_loss": 0.78361475, "learning_rate": 3.8057346965065006e-06, "loss": 0.80635095, "num_input_tokens_seen": 29378230, "step": 1391, "time_per_iteration": 2.5255582332611084 }, { "auxiliary_loss_clip": 0.01193287, "auxiliary_loss_mlp": 0.01041874, "balance_loss_clip": 1.06223631, "balance_loss_mlp": 1.03252172, "epoch": 0.16737810376961462, "flos": 31831541516160.0, "grad_norm": 1.7524224492814058, "language_loss": 0.84315723, "learning_rate": 3.805399664653443e-06, "loss": 0.86550879, "num_input_tokens_seen": 29400370, "step": 1392, "time_per_iteration": 2.649125099182129 }, { "auxiliary_loss_clip": 0.01229916, "auxiliary_loss_mlp": 0.01043868, "balance_loss_clip": 1.06738639, "balance_loss_mlp": 1.0341171, "epoch": 0.1674983466602537, "flos": 27961843553280.0, "grad_norm": 2.405026206722115, "language_loss": 0.7385056, "learning_rate": 3.805064358922692e-06, "loss": 0.76124346, "num_input_tokens_seen": 29418660, "step": 1393, "time_per_iteration": 2.519434690475464 }, { "auxiliary_loss_clip": 0.01212753, "auxiliary_loss_mlp": 0.01040363, "balance_loss_clip": 1.06323171, "balance_loss_mlp": 1.02902067, "epoch": 0.16761858955089282, "flos": 21762656858880.0, "grad_norm": 1.9702756158318395, "language_loss": 0.80901164, "learning_rate": 3.8047287793651136e-06, "loss": 0.83154279, "num_input_tokens_seen": 29440105, "step": 1394, "time_per_iteration": 3.3505115509033203 }, { "auxiliary_loss_clip": 0.01182287, "auxiliary_loss_mlp": 0.01039598, "balance_loss_clip": 1.06085384, "balance_loss_mlp": 1.02919066, "epoch": 0.1677388324415319, "flos": 23805507058560.0, "grad_norm": 1.8523649421610568, "language_loss": 0.88369763, "learning_rate": 3.8043929260316137e-06, "loss": 0.90591645, "num_input_tokens_seen": 29458260, "step": 1395, "time_per_iteration": 2.693598985671997 }, { "auxiliary_loss_clip": 0.01199735, "auxiliary_loss_mlp": 0.01047626, "balance_loss_clip": 1.06919909, "balance_loss_mlp": 1.03740335, "epoch": 0.16785907533217098, "flos": 20558859431040.0, "grad_norm": 1.9559252679283385, "language_loss": 0.83569205, "learning_rate": 3.8040567989731417e-06, "loss": 0.85816562, "num_input_tokens_seen": 29476205, "step": 1396, "time_per_iteration": 2.661323308944702 }, { "auxiliary_loss_clip": 0.01209435, "auxiliary_loss_mlp": 0.01037107, "balance_loss_clip": 1.06733918, "balance_loss_mlp": 1.02765405, "epoch": 0.16797931822281006, "flos": 15669657745920.0, "grad_norm": 3.143436812254544, "language_loss": 0.79576749, "learning_rate": 3.8037203982406876e-06, "loss": 0.81823289, "num_input_tokens_seen": 29494370, "step": 1397, "time_per_iteration": 2.622178077697754 }, { "auxiliary_loss_clip": 0.01228072, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.06787527, "balance_loss_mlp": 1.02993333, "epoch": 0.16809956111344918, "flos": 16541101607040.0, "grad_norm": 2.008846000897291, "language_loss": 0.72850817, "learning_rate": 3.8033837238852835e-06, "loss": 0.75119817, "num_input_tokens_seen": 29511070, "step": 1398, "time_per_iteration": 3.3236799240112305 }, { "auxiliary_loss_clip": 0.01185613, "auxiliary_loss_mlp": 0.01037655, "balance_loss_clip": 1.06066108, "balance_loss_mlp": 1.02784419, "epoch": 0.16821980400408826, "flos": 23258084808960.0, "grad_norm": 1.9786421529871654, "language_loss": 0.69486427, "learning_rate": 3.8030467759580017e-06, "loss": 0.71709692, "num_input_tokens_seen": 29531990, "step": 1399, "time_per_iteration": 2.6619603633880615 }, { "auxiliary_loss_clip": 0.01214097, "auxiliary_loss_mlp": 0.01043887, "balance_loss_clip": 1.06371641, "balance_loss_mlp": 1.03347969, "epoch": 0.16834004689472734, "flos": 20774754126720.0, "grad_norm": 1.9886428492664696, "language_loss": 0.86894417, "learning_rate": 3.802709554509958e-06, "loss": 0.89152408, "num_input_tokens_seen": 29549790, "step": 1400, "time_per_iteration": 4.297313928604126 }, { "auxiliary_loss_clip": 0.01196713, "auxiliary_loss_mlp": 0.01042633, "balance_loss_clip": 1.0615139, "balance_loss_mlp": 1.03290582, "epoch": 0.16846028978536645, "flos": 26687302289280.0, "grad_norm": 1.9223240280420086, "language_loss": 0.7925126, "learning_rate": 3.8023720595923083e-06, "loss": 0.814906, "num_input_tokens_seen": 29569045, "step": 1401, "time_per_iteration": 2.6128580570220947 }, { "auxiliary_loss_clip": 0.01161336, "auxiliary_loss_mlp": 0.01036099, "balance_loss_clip": 1.05613208, "balance_loss_mlp": 1.02602625, "epoch": 0.16858053267600553, "flos": 18843298980480.0, "grad_norm": 4.817149812004977, "language_loss": 0.87419111, "learning_rate": 3.80203429125625e-06, "loss": 0.89616549, "num_input_tokens_seen": 29587220, "step": 1402, "time_per_iteration": 2.7014243602752686 }, { "auxiliary_loss_clip": 0.01141797, "auxiliary_loss_mlp": 0.01040197, "balance_loss_clip": 1.05594945, "balance_loss_mlp": 1.03024912, "epoch": 0.16870077556664462, "flos": 27744548227200.0, "grad_norm": 1.7861258458219211, "language_loss": 0.70018935, "learning_rate": 3.8016962495530225e-06, "loss": 0.7220093, "num_input_tokens_seen": 29606410, "step": 1403, "time_per_iteration": 2.6691479682922363 }, { "auxiliary_loss_clip": 0.01228933, "auxiliary_loss_mlp": 0.01042224, "balance_loss_clip": 1.06729484, "balance_loss_mlp": 1.03200817, "epoch": 0.1688210184572837, "flos": 13730768484480.0, "grad_norm": 2.5689621608904236, "language_loss": 0.77089375, "learning_rate": 3.8013579345339063e-06, "loss": 0.79360533, "num_input_tokens_seen": 29621275, "step": 1404, "time_per_iteration": 2.454477310180664 }, { "auxiliary_loss_clip": 0.01184061, "auxiliary_loss_mlp": 0.01040563, "balance_loss_clip": 1.06234932, "balance_loss_mlp": 1.03009033, "epoch": 0.1689412613479228, "flos": 26468785900800.0, "grad_norm": 1.9988838673197138, "language_loss": 0.69129062, "learning_rate": 3.801019346250224e-06, "loss": 0.71353686, "num_input_tokens_seen": 29641420, "step": 1405, "time_per_iteration": 2.6108644008636475 }, { "auxiliary_loss_clip": 0.01208261, "auxiliary_loss_mlp": 0.01037853, "balance_loss_clip": 1.06395674, "balance_loss_mlp": 1.02800071, "epoch": 0.1690615042385619, "flos": 21138852337920.0, "grad_norm": 2.7759979757427606, "language_loss": 0.83822215, "learning_rate": 3.8006804847533395e-06, "loss": 0.8606832, "num_input_tokens_seen": 29660935, "step": 1406, "time_per_iteration": 2.540719747543335 }, { "auxiliary_loss_clip": 0.01226799, "auxiliary_loss_mlp": 0.01046693, "balance_loss_clip": 1.06707001, "balance_loss_mlp": 1.03673911, "epoch": 0.16918174712920098, "flos": 20849340718080.0, "grad_norm": 2.24411229795782, "language_loss": 0.85196137, "learning_rate": 3.8003413500946556e-06, "loss": 0.87469637, "num_input_tokens_seen": 29681045, "step": 1407, "time_per_iteration": 2.483126640319824 }, { "auxiliary_loss_clip": 0.01199506, "auxiliary_loss_mlp": 0.01046148, "balance_loss_clip": 1.06616879, "balance_loss_mlp": 1.03517509, "epoch": 0.1693019900198401, "flos": 16983270028800.0, "grad_norm": 4.14568690009498, "language_loss": 0.82694411, "learning_rate": 3.8000019423256216e-06, "loss": 0.84940064, "num_input_tokens_seen": 29698810, "step": 1408, "time_per_iteration": 2.537609577178955 }, { "auxiliary_loss_clip": 0.01183898, "auxiliary_loss_mlp": 0.01042414, "balance_loss_clip": 1.06177258, "balance_loss_mlp": 1.03285968, "epoch": 0.16942223291047917, "flos": 26796901662720.0, "grad_norm": 1.5755231158279994, "language_loss": 0.88008082, "learning_rate": 3.7996622614977234e-06, "loss": 0.90234393, "num_input_tokens_seen": 29720000, "step": 1409, "time_per_iteration": 2.595411777496338 }, { "auxiliary_loss_clip": 0.0119537, "auxiliary_loss_mlp": 0.01033706, "balance_loss_clip": 1.06551099, "balance_loss_mlp": 1.02402008, "epoch": 0.16954247580111825, "flos": 18583700411520.0, "grad_norm": 2.0098084269659267, "language_loss": 0.79330295, "learning_rate": 3.799322307662492e-06, "loss": 0.81559372, "num_input_tokens_seen": 29737820, "step": 1410, "time_per_iteration": 2.5505847930908203 }, { "auxiliary_loss_clip": 0.01169828, "auxiliary_loss_mlp": 0.01040654, "balance_loss_clip": 1.06118155, "balance_loss_mlp": 1.03031802, "epoch": 0.16966271869175734, "flos": 13983651210240.0, "grad_norm": 2.7093823340228087, "language_loss": 0.83474135, "learning_rate": 3.798982080871496e-06, "loss": 0.85684609, "num_input_tokens_seen": 29752960, "step": 1411, "time_per_iteration": 2.5773837566375732 }, { "auxiliary_loss_clip": 0.01227813, "auxiliary_loss_mlp": 0.01045555, "balance_loss_clip": 1.0677321, "balance_loss_mlp": 1.03511775, "epoch": 0.16978296158239645, "flos": 37487328284160.0, "grad_norm": 2.8513743344798113, "language_loss": 0.67986417, "learning_rate": 3.798641581176349e-06, "loss": 0.70259786, "num_input_tokens_seen": 29775240, "step": 1412, "time_per_iteration": 2.64139461517334 }, { "auxiliary_loss_clip": 0.01198402, "auxiliary_loss_mlp": 0.01035455, "balance_loss_clip": 1.06337047, "balance_loss_mlp": 1.02551329, "epoch": 0.16990320447303553, "flos": 28328958506880.0, "grad_norm": 2.1614962047542474, "language_loss": 0.74593604, "learning_rate": 3.7983008086287044e-06, "loss": 0.76827461, "num_input_tokens_seen": 29796560, "step": 1413, "time_per_iteration": 2.600578784942627 }, { "auxiliary_loss_clip": 0.01195622, "auxiliary_loss_mlp": 0.01036551, "balance_loss_clip": 1.06146479, "balance_loss_mlp": 1.02561367, "epoch": 0.1700234473636746, "flos": 20188189031040.0, "grad_norm": 5.680100536750502, "language_loss": 0.79291528, "learning_rate": 3.797959763280257e-06, "loss": 0.81523705, "num_input_tokens_seen": 29815245, "step": 1414, "time_per_iteration": 2.5528366565704346 }, { "auxiliary_loss_clip": 0.01216186, "auxiliary_loss_mlp": 0.01036789, "balance_loss_clip": 1.06760156, "balance_loss_mlp": 1.02672756, "epoch": 0.17014369025431372, "flos": 24858658846080.0, "grad_norm": 1.7953407374287453, "language_loss": 0.78755563, "learning_rate": 3.797618445182743e-06, "loss": 0.81008542, "num_input_tokens_seen": 29836640, "step": 1415, "time_per_iteration": 2.546335220336914 }, { "auxiliary_loss_clip": 0.01165839, "auxiliary_loss_mlp": 0.01039461, "balance_loss_clip": 1.05987632, "balance_loss_mlp": 1.02935767, "epoch": 0.1702639331449528, "flos": 16467233287680.0, "grad_norm": 2.2206166638963385, "language_loss": 0.84771717, "learning_rate": 3.79727685438794e-06, "loss": 0.86977017, "num_input_tokens_seen": 29850830, "step": 1416, "time_per_iteration": 2.6027839183807373 }, { "auxiliary_loss_clip": 0.01116823, "auxiliary_loss_mlp": 0.01008732, "balance_loss_clip": 1.0372237, "balance_loss_mlp": 1.00620449, "epoch": 0.1703841760355919, "flos": 52508870979840.0, "grad_norm": 0.8421112231301212, "language_loss": 0.61671001, "learning_rate": 3.796934990947667e-06, "loss": 0.63796556, "num_input_tokens_seen": 29912515, "step": 1417, "time_per_iteration": 3.1229758262634277 }, { "auxiliary_loss_clip": 0.01111701, "auxiliary_loss_mlp": 0.0100654, "balance_loss_clip": 1.03336358, "balance_loss_mlp": 1.00402486, "epoch": 0.170504418926231, "flos": 49370637576960.0, "grad_norm": 0.8769125072509799, "language_loss": 0.62468582, "learning_rate": 3.7965928549137854e-06, "loss": 0.64586824, "num_input_tokens_seen": 29969330, "step": 1418, "time_per_iteration": 3.0394320487976074 }, { "auxiliary_loss_clip": 0.01187142, "auxiliary_loss_mlp": 0.01040268, "balance_loss_clip": 1.05840611, "balance_loss_mlp": 1.02977133, "epoch": 0.17062466181687008, "flos": 25849219184640.0, "grad_norm": 2.2320632333552575, "language_loss": 0.77782857, "learning_rate": 3.7962504463381953e-06, "loss": 0.80010265, "num_input_tokens_seen": 29990820, "step": 1419, "time_per_iteration": 2.663985013961792 }, { "auxiliary_loss_clip": 0.01189577, "auxiliary_loss_mlp": 0.00764498, "balance_loss_clip": 1.06431603, "balance_loss_mlp": 1.0000093, "epoch": 0.17074490470750917, "flos": 20960412549120.0, "grad_norm": 1.741666687432193, "language_loss": 0.79008847, "learning_rate": 3.7959077652728412e-06, "loss": 0.80962926, "num_input_tokens_seen": 30009275, "step": 1420, "time_per_iteration": 3.289870023727417 }, { "auxiliary_loss_clip": 0.01199181, "auxiliary_loss_mlp": 0.01045866, "balance_loss_clip": 1.0653764, "balance_loss_mlp": 1.03654361, "epoch": 0.17086514759814825, "flos": 20959766104320.0, "grad_norm": 2.0628873933921357, "language_loss": 0.77483153, "learning_rate": 3.795564811769707e-06, "loss": 0.79728198, "num_input_tokens_seen": 30027630, "step": 1421, "time_per_iteration": 2.5581743717193604 }, { "auxiliary_loss_clip": 0.01199855, "auxiliary_loss_mlp": 0.01044223, "balance_loss_clip": 1.06912422, "balance_loss_mlp": 1.03415573, "epoch": 0.17098539048878736, "flos": 28474073452800.0, "grad_norm": 1.9174463274387357, "language_loss": 0.77917755, "learning_rate": 3.795221585880818e-06, "loss": 0.80161828, "num_input_tokens_seen": 30048310, "step": 1422, "time_per_iteration": 2.6282262802124023 }, { "auxiliary_loss_clip": 0.01190792, "auxiliary_loss_mlp": 0.01039924, "balance_loss_clip": 1.07154202, "balance_loss_mlp": 1.03037548, "epoch": 0.17110563337942644, "flos": 16290014561280.0, "grad_norm": 2.031471953422813, "language_loss": 0.9127214, "learning_rate": 3.794878087658242e-06, "loss": 0.93502855, "num_input_tokens_seen": 30066080, "step": 1423, "time_per_iteration": 2.572491407394409 }, { "auxiliary_loss_clip": 0.01215496, "auxiliary_loss_mlp": 0.01043525, "balance_loss_clip": 1.0668931, "balance_loss_mlp": 1.03419054, "epoch": 0.17122587627006552, "flos": 29674207693440.0, "grad_norm": 2.01758154185329, "language_loss": 0.78472501, "learning_rate": 3.7945343171540873e-06, "loss": 0.80731523, "num_input_tokens_seen": 30086955, "step": 1424, "time_per_iteration": 3.3473005294799805 }, { "auxiliary_loss_clip": 0.01231829, "auxiliary_loss_mlp": 0.01046855, "balance_loss_clip": 1.07065499, "balance_loss_mlp": 1.03688312, "epoch": 0.17134611916070464, "flos": 25338389915520.0, "grad_norm": 5.495954222542481, "language_loss": 0.78793347, "learning_rate": 3.7941902744205033e-06, "loss": 0.81072032, "num_input_tokens_seen": 30107990, "step": 1425, "time_per_iteration": 2.5422165393829346 }, { "auxiliary_loss_clip": 0.01202271, "auxiliary_loss_mlp": 0.0103683, "balance_loss_clip": 1.06429887, "balance_loss_mlp": 1.02651787, "epoch": 0.17146636205134372, "flos": 13953845900160.0, "grad_norm": 2.1467656665767842, "language_loss": 0.83779061, "learning_rate": 3.7938459595096817e-06, "loss": 0.86018163, "num_input_tokens_seen": 30126535, "step": 1426, "time_per_iteration": 3.271543264389038 }, { "auxiliary_loss_clip": 0.0121914, "auxiliary_loss_mlp": 0.0104237, "balance_loss_clip": 1.06786442, "balance_loss_mlp": 1.03215337, "epoch": 0.1715866049419828, "flos": 23915214172800.0, "grad_norm": 1.9110132720155548, "language_loss": 0.8619169, "learning_rate": 3.7935013724738545e-06, "loss": 0.88453197, "num_input_tokens_seen": 30147035, "step": 1427, "time_per_iteration": 3.394890546798706 }, { "auxiliary_loss_clip": 0.0120722, "auxiliary_loss_mlp": 0.01046881, "balance_loss_clip": 1.06592834, "balance_loss_mlp": 1.03756452, "epoch": 0.17170684783262188, "flos": 22709369669760.0, "grad_norm": 1.9816725958863315, "language_loss": 0.77821112, "learning_rate": 3.7931565133652945e-06, "loss": 0.8007521, "num_input_tokens_seen": 30167110, "step": 1428, "time_per_iteration": 2.5568606853485107 }, { "auxiliary_loss_clip": 0.01227504, "auxiliary_loss_mlp": 0.01050115, "balance_loss_clip": 1.06755722, "balance_loss_mlp": 1.03978539, "epoch": 0.171827090723261, "flos": 26613290315520.0, "grad_norm": 2.3128813217535398, "language_loss": 0.67811757, "learning_rate": 3.792811382236317e-06, "loss": 0.70089376, "num_input_tokens_seen": 30185620, "step": 1429, "time_per_iteration": 2.5200276374816895 }, { "auxiliary_loss_clip": 0.01215217, "auxiliary_loss_mlp": 0.0103827, "balance_loss_clip": 1.06557047, "balance_loss_mlp": 1.02783871, "epoch": 0.17194733361390008, "flos": 28148507556480.0, "grad_norm": 1.9478584090144946, "language_loss": 0.77962387, "learning_rate": 3.792465979139279e-06, "loss": 0.80215871, "num_input_tokens_seen": 30208225, "step": 1430, "time_per_iteration": 2.602949619293213 }, { "auxiliary_loss_clip": 0.01078014, "auxiliary_loss_mlp": 0.01044778, "balance_loss_clip": 1.02725673, "balance_loss_mlp": 1.04239368, "epoch": 0.17206757650453916, "flos": 65530689753600.0, "grad_norm": 0.9417768365819899, "language_loss": 0.6562466, "learning_rate": 3.792120304126576e-06, "loss": 0.67747456, "num_input_tokens_seen": 30271600, "step": 1431, "time_per_iteration": 3.192549467086792 }, { "auxiliary_loss_clip": 0.01138589, "auxiliary_loss_mlp": 0.01041514, "balance_loss_clip": 1.05443323, "balance_loss_mlp": 1.03136325, "epoch": 0.17218781939517827, "flos": 22273486128000.0, "grad_norm": 2.223884891384171, "language_loss": 0.83717597, "learning_rate": 3.791774357250649e-06, "loss": 0.85897702, "num_input_tokens_seen": 30290430, "step": 1432, "time_per_iteration": 2.700982093811035 }, { "auxiliary_loss_clip": 0.01196174, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.06511688, "balance_loss_mlp": 1.02865398, "epoch": 0.17230806228581735, "flos": 14137313592960.0, "grad_norm": 2.272785574779725, "language_loss": 0.7957477, "learning_rate": 3.7914281385639757e-06, "loss": 0.81810164, "num_input_tokens_seen": 30308305, "step": 1433, "time_per_iteration": 2.5358924865722656 }, { "auxiliary_loss_clip": 0.01210269, "auxiliary_loss_mlp": 0.01045633, "balance_loss_clip": 1.06204939, "balance_loss_mlp": 1.0359416, "epoch": 0.17242830517645644, "flos": 20704836303360.0, "grad_norm": 1.8628525507739992, "language_loss": 0.79627657, "learning_rate": 3.7910816481190784e-06, "loss": 0.81883556, "num_input_tokens_seen": 30328120, "step": 1434, "time_per_iteration": 2.542018413543701 }, { "auxiliary_loss_clip": 0.01185128, "auxiliary_loss_mlp": 0.01040709, "balance_loss_clip": 1.0599215, "balance_loss_mlp": 1.03061843, "epoch": 0.17254854806709552, "flos": 30774582887040.0, "grad_norm": 1.9639786658371834, "language_loss": 0.74936414, "learning_rate": 3.7907348859685193e-06, "loss": 0.77162248, "num_input_tokens_seen": 30349825, "step": 1435, "time_per_iteration": 2.6402735710144043 }, { "auxiliary_loss_clip": 0.01207436, "auxiliary_loss_mlp": 0.01038049, "balance_loss_clip": 1.06684279, "balance_loss_mlp": 1.02894175, "epoch": 0.17266879095773463, "flos": 26614726859520.0, "grad_norm": 2.0482864832365997, "language_loss": 0.80540937, "learning_rate": 3.790387852164902e-06, "loss": 0.82786429, "num_input_tokens_seen": 30370555, "step": 1436, "time_per_iteration": 2.5719830989837646 }, { "auxiliary_loss_clip": 0.01213251, "auxiliary_loss_mlp": 0.01038212, "balance_loss_clip": 1.06556153, "balance_loss_mlp": 1.02851439, "epoch": 0.1727890338483737, "flos": 20266295155200.0, "grad_norm": 1.7806796677299148, "language_loss": 0.76219273, "learning_rate": 3.7900405467608707e-06, "loss": 0.78470737, "num_input_tokens_seen": 30390100, "step": 1437, "time_per_iteration": 2.530714511871338 }, { "auxiliary_loss_clip": 0.01148633, "auxiliary_loss_mlp": 0.01051886, "balance_loss_clip": 1.05218899, "balance_loss_mlp": 1.04200363, "epoch": 0.1729092767390128, "flos": 18179812909440.0, "grad_norm": 5.613838741454218, "language_loss": 0.79602957, "learning_rate": 3.7896929698091114e-06, "loss": 0.81803477, "num_input_tokens_seen": 30402915, "step": 1438, "time_per_iteration": 2.569930076599121 }, { "auxiliary_loss_clip": 0.01231562, "auxiliary_loss_mlp": 0.01052747, "balance_loss_clip": 1.07121062, "balance_loss_mlp": 1.04247141, "epoch": 0.1730295196296519, "flos": 26759518583040.0, "grad_norm": 2.488771281509292, "language_loss": 0.67967069, "learning_rate": 3.7893451213623518e-06, "loss": 0.70251375, "num_input_tokens_seen": 30420145, "step": 1439, "time_per_iteration": 2.5314810276031494 }, { "auxiliary_loss_clip": 0.0121026, "auxiliary_loss_mlp": 0.00763969, "balance_loss_clip": 1.06560063, "balance_loss_mlp": 1.0000422, "epoch": 0.173149762520291, "flos": 23842531002240.0, "grad_norm": 1.9683334380393174, "language_loss": 0.82259959, "learning_rate": 3.7889970014733606e-06, "loss": 0.84234184, "num_input_tokens_seen": 30439250, "step": 1440, "time_per_iteration": 2.5377919673919678 }, { "auxiliary_loss_clip": 0.01148116, "auxiliary_loss_mlp": 0.010392, "balance_loss_clip": 1.05430257, "balance_loss_mlp": 1.02907908, "epoch": 0.17327000541093007, "flos": 23368186972800.0, "grad_norm": 1.630740678661122, "language_loss": 0.77576542, "learning_rate": 3.7886486101949463e-06, "loss": 0.79763854, "num_input_tokens_seen": 30460430, "step": 1441, "time_per_iteration": 2.664477825164795 }, { "auxiliary_loss_clip": 0.01151081, "auxiliary_loss_mlp": 0.01040892, "balance_loss_clip": 1.05370426, "balance_loss_mlp": 1.03158832, "epoch": 0.17339024830156918, "flos": 18221290139520.0, "grad_norm": 1.99201395455247, "language_loss": 0.88168782, "learning_rate": 3.7882999475799594e-06, "loss": 0.90360755, "num_input_tokens_seen": 30478465, "step": 1442, "time_per_iteration": 2.6065685749053955 }, { "auxiliary_loss_clip": 0.01150664, "auxiliary_loss_mlp": 0.01041143, "balance_loss_clip": 1.05876219, "balance_loss_mlp": 1.03168416, "epoch": 0.17351049119220827, "flos": 23332024955520.0, "grad_norm": 1.8688816848506866, "language_loss": 0.81599987, "learning_rate": 3.787951013681293e-06, "loss": 0.83791792, "num_input_tokens_seen": 30496510, "step": 1443, "time_per_iteration": 2.6398072242736816 }, { "auxiliary_loss_clip": 0.01213628, "auxiliary_loss_mlp": 0.01047505, "balance_loss_clip": 1.06558347, "balance_loss_mlp": 1.03681207, "epoch": 0.17363073408284735, "flos": 23803495896960.0, "grad_norm": 2.2567568762751056, "language_loss": 0.77593368, "learning_rate": 3.787601808551879e-06, "loss": 0.798545, "num_input_tokens_seen": 30516325, "step": 1444, "time_per_iteration": 2.5401294231414795 }, { "auxiliary_loss_clip": 0.01184676, "auxiliary_loss_mlp": 0.01036502, "balance_loss_clip": 1.06264758, "balance_loss_mlp": 1.02643442, "epoch": 0.17375097697348643, "flos": 18515290959360.0, "grad_norm": 2.659630089582604, "language_loss": 0.83703017, "learning_rate": 3.7872523322446926e-06, "loss": 0.85924196, "num_input_tokens_seen": 30535210, "step": 1445, "time_per_iteration": 2.5807924270629883 }, { "auxiliary_loss_clip": 0.01168053, "auxiliary_loss_mlp": 0.01038327, "balance_loss_clip": 1.05512249, "balance_loss_mlp": 1.02874875, "epoch": 0.17387121986412554, "flos": 38877897456000.0, "grad_norm": 2.1873450348915333, "language_loss": 0.60415113, "learning_rate": 3.7869025848127478e-06, "loss": 0.62621498, "num_input_tokens_seen": 30559405, "step": 1446, "time_per_iteration": 3.58294677734375 }, { "auxiliary_loss_clip": 0.01211267, "auxiliary_loss_mlp": 0.01042263, "balance_loss_clip": 1.06326723, "balance_loss_mlp": 1.03261948, "epoch": 0.17399146275476463, "flos": 20375714960640.0, "grad_norm": 2.6802538720167814, "language_loss": 0.80866027, "learning_rate": 3.786552566309102e-06, "loss": 0.83119559, "num_input_tokens_seen": 30577615, "step": 1447, "time_per_iteration": 2.568075656890869 }, { "auxiliary_loss_clip": 0.0119242, "auxiliary_loss_mlp": 0.00763495, "balance_loss_clip": 1.06542492, "balance_loss_mlp": 1.00002778, "epoch": 0.1741117056454037, "flos": 19164339763200.0, "grad_norm": 2.3857299760790047, "language_loss": 0.86283922, "learning_rate": 3.7862022767868517e-06, "loss": 0.88239837, "num_input_tokens_seen": 30595205, "step": 1448, "time_per_iteration": 2.521491289138794 }, { "auxiliary_loss_clip": 0.01180519, "auxiliary_loss_mlp": 0.01034632, "balance_loss_clip": 1.06756842, "balance_loss_mlp": 1.02511358, "epoch": 0.17423194853604282, "flos": 25374300537600.0, "grad_norm": 2.017867728641486, "language_loss": 0.84379548, "learning_rate": 3.7858517162991367e-06, "loss": 0.86594701, "num_input_tokens_seen": 30615280, "step": 1449, "time_per_iteration": 2.6106793880462646 }, { "auxiliary_loss_clip": 0.01184088, "auxiliary_loss_mlp": 0.01050976, "balance_loss_clip": 1.06151223, "balance_loss_mlp": 1.04139709, "epoch": 0.1743521914266819, "flos": 25191874339200.0, "grad_norm": 2.5815007274916697, "language_loss": 0.60931665, "learning_rate": 3.7855008848991363e-06, "loss": 0.63166732, "num_input_tokens_seen": 30633485, "step": 1450, "time_per_iteration": 3.3537607192993164 }, { "auxiliary_loss_clip": 0.01196121, "auxiliary_loss_mlp": 0.01043746, "balance_loss_clip": 1.0669024, "balance_loss_mlp": 1.03482926, "epoch": 0.17447243431732098, "flos": 25666577504640.0, "grad_norm": 1.775737250947227, "language_loss": 0.77384877, "learning_rate": 3.7851497826400714e-06, "loss": 0.79624748, "num_input_tokens_seen": 30653625, "step": 1451, "time_per_iteration": 3.3626925945281982 }, { "auxiliary_loss_clip": 0.01227942, "auxiliary_loss_mlp": 0.0105996, "balance_loss_clip": 1.06911218, "balance_loss_mlp": 1.04846883, "epoch": 0.17459267720796007, "flos": 36281950657920.0, "grad_norm": 2.512445677498836, "language_loss": 0.76062685, "learning_rate": 3.7847984095752034e-06, "loss": 0.78350586, "num_input_tokens_seen": 30677080, "step": 1452, "time_per_iteration": 3.6212472915649414 }, { "auxiliary_loss_clip": 0.01224737, "auxiliary_loss_mlp": 0.01034226, "balance_loss_clip": 1.06602681, "balance_loss_mlp": 1.02526116, "epoch": 0.17471292009859918, "flos": 20011113959040.0, "grad_norm": 2.0480958816963857, "language_loss": 0.8033272, "learning_rate": 3.784446765757836e-06, "loss": 0.82591677, "num_input_tokens_seen": 30695725, "step": 1453, "time_per_iteration": 2.4838550090789795 }, { "auxiliary_loss_clip": 0.01164127, "auxiliary_loss_mlp": 0.01041891, "balance_loss_clip": 1.05817986, "balance_loss_mlp": 1.0311625, "epoch": 0.17483316298923826, "flos": 27819242559360.0, "grad_norm": 2.1671794210877744, "language_loss": 0.77732468, "learning_rate": 3.7840948512413133e-06, "loss": 0.79938483, "num_input_tokens_seen": 30713310, "step": 1454, "time_per_iteration": 2.629128932952881 }, { "auxiliary_loss_clip": 0.01176014, "auxiliary_loss_mlp": 0.01039123, "balance_loss_clip": 1.06337237, "balance_loss_mlp": 1.02888334, "epoch": 0.17495340587987734, "flos": 44017934791680.0, "grad_norm": 1.8242041563377056, "language_loss": 0.78671551, "learning_rate": 3.7837426660790196e-06, "loss": 0.80886686, "num_input_tokens_seen": 30734725, "step": 1455, "time_per_iteration": 2.7803046703338623 }, { "auxiliary_loss_clip": 0.01223971, "auxiliary_loss_mlp": 0.01032303, "balance_loss_clip": 1.0662154, "balance_loss_mlp": 1.02323723, "epoch": 0.17507364877051645, "flos": 20885825957760.0, "grad_norm": 2.506330515155809, "language_loss": 0.81767577, "learning_rate": 3.783390210324382e-06, "loss": 0.84023851, "num_input_tokens_seen": 30754450, "step": 1456, "time_per_iteration": 2.5097544193267822 }, { "auxiliary_loss_clip": 0.0117941, "auxiliary_loss_mlp": 0.01033982, "balance_loss_clip": 1.06398773, "balance_loss_mlp": 1.02508879, "epoch": 0.17519389166115554, "flos": 24717602136960.0, "grad_norm": 4.541607797428815, "language_loss": 0.72494256, "learning_rate": 3.7830374840308676e-06, "loss": 0.74707651, "num_input_tokens_seen": 30774605, "step": 1457, "time_per_iteration": 2.614821195602417 }, { "auxiliary_loss_clip": 0.01207825, "auxiliary_loss_mlp": 0.01036513, "balance_loss_clip": 1.06496644, "balance_loss_mlp": 1.02661264, "epoch": 0.17531413455179462, "flos": 23798144770560.0, "grad_norm": 2.774140771543342, "language_loss": 0.82623708, "learning_rate": 3.7826844872519842e-06, "loss": 0.84868038, "num_input_tokens_seen": 30792460, "step": 1458, "time_per_iteration": 2.5492103099823 }, { "auxiliary_loss_clip": 0.0119401, "auxiliary_loss_mlp": 0.01040569, "balance_loss_clip": 1.06554222, "balance_loss_mlp": 1.03137839, "epoch": 0.1754343774424337, "flos": 24572379450240.0, "grad_norm": 1.9781146423826275, "language_loss": 0.72576058, "learning_rate": 3.782331220041282e-06, "loss": 0.74810636, "num_input_tokens_seen": 30812525, "step": 1459, "time_per_iteration": 2.616131067276001 }, { "auxiliary_loss_clip": 0.01186352, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.06024098, "balance_loss_mlp": 1.02498531, "epoch": 0.17555462033307281, "flos": 18114599767680.0, "grad_norm": 2.2115048429092226, "language_loss": 0.82877642, "learning_rate": 3.7819776824523504e-06, "loss": 0.85099185, "num_input_tokens_seen": 30830390, "step": 1460, "time_per_iteration": 2.566915988922119 }, { "auxiliary_loss_clip": 0.01204441, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.06451309, "balance_loss_mlp": 1.03329134, "epoch": 0.1756748632237119, "flos": 28366018364160.0, "grad_norm": 2.020261316261261, "language_loss": 0.84018487, "learning_rate": 3.7816238745388213e-06, "loss": 0.86266065, "num_input_tokens_seen": 30849935, "step": 1461, "time_per_iteration": 2.6337151527404785 }, { "auxiliary_loss_clip": 0.01201901, "auxiliary_loss_mlp": 0.01032436, "balance_loss_clip": 1.06285667, "balance_loss_mlp": 1.02323329, "epoch": 0.17579510611435098, "flos": 25732939881600.0, "grad_norm": 1.8890123011359579, "language_loss": 0.86768591, "learning_rate": 3.781269796354367e-06, "loss": 0.89002919, "num_input_tokens_seen": 30869555, "step": 1462, "time_per_iteration": 2.590184211730957 }, { "auxiliary_loss_clip": 0.01200371, "auxiliary_loss_mlp": 0.01039979, "balance_loss_clip": 1.06680512, "balance_loss_mlp": 1.03076422, "epoch": 0.1759153490049901, "flos": 18588081870720.0, "grad_norm": 1.6644148923140722, "language_loss": 0.86085731, "learning_rate": 3.7809154479527006e-06, "loss": 0.88326085, "num_input_tokens_seen": 30888760, "step": 1463, "time_per_iteration": 2.5499353408813477 }, { "auxiliary_loss_clip": 0.01175723, "auxiliary_loss_mlp": 0.01040279, "balance_loss_clip": 1.06283247, "balance_loss_mlp": 1.03052735, "epoch": 0.17603559189562917, "flos": 18619323724800.0, "grad_norm": 2.1238055719626483, "language_loss": 0.84270471, "learning_rate": 3.780560829387577e-06, "loss": 0.86486471, "num_input_tokens_seen": 30907260, "step": 1464, "time_per_iteration": 2.562060832977295 }, { "auxiliary_loss_clip": 0.01103392, "auxiliary_loss_mlp": 0.01011353, "balance_loss_clip": 1.02628541, "balance_loss_mlp": 1.00893331, "epoch": 0.17615583478626826, "flos": 60530775373440.0, "grad_norm": 0.8618925605951729, "language_loss": 0.57929695, "learning_rate": 3.7802059407127915e-06, "loss": 0.60044432, "num_input_tokens_seen": 30965810, "step": 1465, "time_per_iteration": 3.153050184249878 }, { "auxiliary_loss_clip": 0.01187407, "auxiliary_loss_mlp": 0.01036181, "balance_loss_clip": 1.05984592, "balance_loss_mlp": 1.02588749, "epoch": 0.17627607767690734, "flos": 23616221362560.0, "grad_norm": 2.157152820883273, "language_loss": 0.86169446, "learning_rate": 3.7798507819821797e-06, "loss": 0.88393039, "num_input_tokens_seen": 30982935, "step": 1466, "time_per_iteration": 2.5706067085266113 }, { "auxiliary_loss_clip": 0.01172193, "auxiliary_loss_mlp": 0.01042413, "balance_loss_clip": 1.06073713, "balance_loss_mlp": 1.03213763, "epoch": 0.17639632056754645, "flos": 17639070589440.0, "grad_norm": 2.341536771038795, "language_loss": 0.78631604, "learning_rate": 3.7794953532496197e-06, "loss": 0.80846208, "num_input_tokens_seen": 30998840, "step": 1467, "time_per_iteration": 2.5720338821411133 }, { "auxiliary_loss_clip": 0.01064566, "auxiliary_loss_mlp": 0.00753602, "balance_loss_clip": 1.02635026, "balance_loss_mlp": 1.00001144, "epoch": 0.17651656345818553, "flos": 57932604910080.0, "grad_norm": 0.9194190154793992, "language_loss": 0.57952601, "learning_rate": 3.7791396545690295e-06, "loss": 0.59770769, "num_input_tokens_seen": 31060075, "step": 1468, "time_per_iteration": 3.215304374694824 }, { "auxiliary_loss_clip": 0.0121417, "auxiliary_loss_mlp": 0.01036199, "balance_loss_clip": 1.06852186, "balance_loss_mlp": 1.02654862, "epoch": 0.17663680634882462, "flos": 22929502170240.0, "grad_norm": 3.4612559885687415, "language_loss": 0.80816615, "learning_rate": 3.7787836859943685e-06, "loss": 0.83066988, "num_input_tokens_seen": 31078800, "step": 1469, "time_per_iteration": 2.550433397293091 }, { "auxiliary_loss_clip": 0.01209267, "auxiliary_loss_mlp": 0.01049246, "balance_loss_clip": 1.06373811, "balance_loss_mlp": 1.03942275, "epoch": 0.17675704923946373, "flos": 22637979388800.0, "grad_norm": 3.9915065294864065, "language_loss": 0.79063171, "learning_rate": 3.7784274475796363e-06, "loss": 0.81321687, "num_input_tokens_seen": 31097430, "step": 1470, "time_per_iteration": 2.540336847305298 }, { "auxiliary_loss_clip": 0.01177436, "auxiliary_loss_mlp": 0.01033518, "balance_loss_clip": 1.05815172, "balance_loss_mlp": 1.02304602, "epoch": 0.1768772921301028, "flos": 27126525795840.0, "grad_norm": 2.990498070170467, "language_loss": 0.76226056, "learning_rate": 3.7780709393788745e-06, "loss": 0.78437012, "num_input_tokens_seen": 31117905, "step": 1471, "time_per_iteration": 2.6271116733551025 }, { "auxiliary_loss_clip": 0.01222779, "auxiliary_loss_mlp": 0.01043078, "balance_loss_clip": 1.06526446, "balance_loss_mlp": 1.03414881, "epoch": 0.1769975350207419, "flos": 19172133014400.0, "grad_norm": 2.1725868379031446, "language_loss": 0.75394273, "learning_rate": 3.777714161446165e-06, "loss": 0.77660131, "num_input_tokens_seen": 31137610, "step": 1472, "time_per_iteration": 3.3371667861938477 }, { "auxiliary_loss_clip": 0.01212767, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.06754482, "balance_loss_mlp": 1.03155088, "epoch": 0.177117777911381, "flos": 36134932291200.0, "grad_norm": 2.115346935327689, "language_loss": 0.69375223, "learning_rate": 3.7773571138356304e-06, "loss": 0.71629322, "num_input_tokens_seen": 31157780, "step": 1473, "time_per_iteration": 2.6712160110473633 }, { "auxiliary_loss_clip": 0.01152481, "auxiliary_loss_mlp": 0.01041919, "balance_loss_clip": 1.05960047, "balance_loss_mlp": 1.03297806, "epoch": 0.17723802080202009, "flos": 22090593052800.0, "grad_norm": 2.2057653189350854, "language_loss": 0.88907099, "learning_rate": 3.776999796601435e-06, "loss": 0.91101503, "num_input_tokens_seen": 31176540, "step": 1474, "time_per_iteration": 2.626495599746704 }, { "auxiliary_loss_clip": 0.01214531, "auxiliary_loss_mlp": 0.01038482, "balance_loss_clip": 1.06537962, "balance_loss_mlp": 1.02868295, "epoch": 0.17735826369265917, "flos": 30222671437440.0, "grad_norm": 1.89763792791142, "language_loss": 0.73025763, "learning_rate": 3.776642209797783e-06, "loss": 0.75278771, "num_input_tokens_seen": 31198370, "step": 1475, "time_per_iteration": 2.607489824295044 }, { "auxiliary_loss_clip": 0.01201611, "auxiliary_loss_mlp": 0.01053952, "balance_loss_clip": 1.06168199, "balance_loss_mlp": 1.04440284, "epoch": 0.17747850658329825, "flos": 21397588980480.0, "grad_norm": 2.693476577264495, "language_loss": 0.77957904, "learning_rate": 3.7762843534789205e-06, "loss": 0.80213463, "num_input_tokens_seen": 31217120, "step": 1476, "time_per_iteration": 3.2833545207977295 }, { "auxiliary_loss_clip": 0.01197829, "auxiliary_loss_mlp": 0.01039851, "balance_loss_clip": 1.0608567, "balance_loss_mlp": 1.03008747, "epoch": 0.17759874947393736, "flos": 16983341856000.0, "grad_norm": 2.2303346811322275, "language_loss": 0.87966663, "learning_rate": 3.7759262276991343e-06, "loss": 0.90204346, "num_input_tokens_seen": 31234730, "step": 1477, "time_per_iteration": 3.336195945739746 }, { "auxiliary_loss_clip": 0.01197545, "auxiliary_loss_mlp": 0.01042816, "balance_loss_clip": 1.06301951, "balance_loss_mlp": 1.03257012, "epoch": 0.17771899236457644, "flos": 11546107390080.0, "grad_norm": 2.166519149451295, "language_loss": 0.80644631, "learning_rate": 3.7755678325127506e-06, "loss": 0.82884991, "num_input_tokens_seen": 31252410, "step": 1478, "time_per_iteration": 2.52659010887146 }, { "auxiliary_loss_clip": 0.01158322, "auxiliary_loss_mlp": 0.01045372, "balance_loss_clip": 1.05849004, "balance_loss_mlp": 1.03544235, "epoch": 0.17783923525521553, "flos": 18807747494400.0, "grad_norm": 1.8503159986133724, "language_loss": 0.75788003, "learning_rate": 3.7752091679741393e-06, "loss": 0.779917, "num_input_tokens_seen": 31270200, "step": 1479, "time_per_iteration": 3.4348692893981934 }, { "auxiliary_loss_clip": 0.01207998, "auxiliary_loss_mlp": 0.01037485, "balance_loss_clip": 1.06570053, "balance_loss_mlp": 1.02799606, "epoch": 0.17795947814585464, "flos": 30408365773440.0, "grad_norm": 3.856463005557222, "language_loss": 0.77929425, "learning_rate": 3.774850234137708e-06, "loss": 0.80174911, "num_input_tokens_seen": 31287495, "step": 1480, "time_per_iteration": 2.57098126411438 }, { "auxiliary_loss_clip": 0.01205831, "auxiliary_loss_mlp": 0.01041965, "balance_loss_clip": 1.06184053, "balance_loss_mlp": 1.032094, "epoch": 0.17807972103649372, "flos": 24389055411840.0, "grad_norm": 3.3822472585350902, "language_loss": 0.83195019, "learning_rate": 3.7744910310579076e-06, "loss": 0.85442823, "num_input_tokens_seen": 31306420, "step": 1481, "time_per_iteration": 2.56130051612854 }, { "auxiliary_loss_clip": 0.01221547, "auxiliary_loss_mlp": 0.01043092, "balance_loss_clip": 1.06543684, "balance_loss_mlp": 1.03344762, "epoch": 0.1781999639271328, "flos": 20301559332480.0, "grad_norm": 1.8085056243548028, "language_loss": 0.84915358, "learning_rate": 3.774131558789229e-06, "loss": 0.87179995, "num_input_tokens_seen": 31325750, "step": 1482, "time_per_iteration": 2.483839988708496 }, { "auxiliary_loss_clip": 0.01223994, "auxiliary_loss_mlp": 0.00762632, "balance_loss_clip": 1.06625187, "balance_loss_mlp": 0.99996281, "epoch": 0.1783202068177719, "flos": 15924479806080.0, "grad_norm": 2.4074267337522506, "language_loss": 0.69889843, "learning_rate": 3.773771817386203e-06, "loss": 0.71876478, "num_input_tokens_seen": 31343080, "step": 1483, "time_per_iteration": 2.5520598888397217 }, { "auxiliary_loss_clip": 0.01190213, "auxiliary_loss_mlp": 0.0103421, "balance_loss_clip": 1.06000447, "balance_loss_mlp": 1.02501297, "epoch": 0.178440449708411, "flos": 20631758083200.0, "grad_norm": 1.6288762733605522, "language_loss": 0.79136693, "learning_rate": 3.773411806903403e-06, "loss": 0.81361115, "num_input_tokens_seen": 31362160, "step": 1484, "time_per_iteration": 2.6342556476593018 }, { "auxiliary_loss_clip": 0.01149716, "auxiliary_loss_mlp": 0.01043557, "balance_loss_clip": 1.05514646, "balance_loss_mlp": 1.03369856, "epoch": 0.17856069259905008, "flos": 21686059105920.0, "grad_norm": 2.0195564743043395, "language_loss": 0.9458971, "learning_rate": 3.7730515273954415e-06, "loss": 0.96782982, "num_input_tokens_seen": 31380770, "step": 1485, "time_per_iteration": 2.645796298980713 }, { "auxiliary_loss_clip": 0.01224266, "auxiliary_loss_mlp": 0.01033903, "balance_loss_clip": 1.067029, "balance_loss_mlp": 1.02380037, "epoch": 0.17868093548968916, "flos": 26572962320640.0, "grad_norm": 1.8074744471311217, "language_loss": 0.84915018, "learning_rate": 3.772690978916973e-06, "loss": 0.87173188, "num_input_tokens_seen": 31400525, "step": 1486, "time_per_iteration": 2.535668134689331 }, { "auxiliary_loss_clip": 0.01210346, "auxiliary_loss_mlp": 0.01040013, "balance_loss_clip": 1.06784523, "balance_loss_mlp": 1.02982628, "epoch": 0.17880117838032827, "flos": 18581006891520.0, "grad_norm": 2.231442128606665, "language_loss": 0.868469, "learning_rate": 3.772330161522693e-06, "loss": 0.89097255, "num_input_tokens_seen": 31418435, "step": 1487, "time_per_iteration": 2.517756938934326 }, { "auxiliary_loss_clip": 0.01191931, "auxiliary_loss_mlp": 0.01037909, "balance_loss_clip": 1.0664463, "balance_loss_mlp": 1.02858138, "epoch": 0.17892142127096736, "flos": 26541217676160.0, "grad_norm": 2.1712225305019013, "language_loss": 0.79618561, "learning_rate": 3.7719690752673365e-06, "loss": 0.81848401, "num_input_tokens_seen": 31439230, "step": 1488, "time_per_iteration": 2.573653221130371 }, { "auxiliary_loss_clip": 0.0118418, "auxiliary_loss_mlp": 0.01049426, "balance_loss_clip": 1.06431782, "balance_loss_mlp": 1.03981161, "epoch": 0.17904166416160644, "flos": 23872623621120.0, "grad_norm": 1.9099062603064685, "language_loss": 0.78287435, "learning_rate": 3.7716077202056796e-06, "loss": 0.80521035, "num_input_tokens_seen": 31457705, "step": 1489, "time_per_iteration": 2.605098009109497 }, { "auxiliary_loss_clip": 0.01179062, "auxiliary_loss_mlp": 0.01047408, "balance_loss_clip": 1.06022799, "balance_loss_mlp": 1.03654206, "epoch": 0.17916190705224552, "flos": 19134426712320.0, "grad_norm": 2.1607969766930877, "language_loss": 0.93610096, "learning_rate": 3.7712460963925404e-06, "loss": 0.95836568, "num_input_tokens_seen": 31473645, "step": 1490, "time_per_iteration": 2.5133471488952637 }, { "auxiliary_loss_clip": 0.01185472, "auxiliary_loss_mlp": 0.01035773, "balance_loss_clip": 1.06016922, "balance_loss_mlp": 1.02681994, "epoch": 0.17928214994288463, "flos": 25152120961920.0, "grad_norm": 1.8168001452556795, "language_loss": 0.7536341, "learning_rate": 3.7708842038827775e-06, "loss": 0.7758466, "num_input_tokens_seen": 31492605, "step": 1491, "time_per_iteration": 2.5829224586486816 }, { "auxiliary_loss_clip": 0.01207396, "auxiliary_loss_mlp": 0.01050889, "balance_loss_clip": 1.06069052, "balance_loss_mlp": 1.04129839, "epoch": 0.17940239283352372, "flos": 22384629786240.0, "grad_norm": 1.7985117972124915, "language_loss": 0.85829532, "learning_rate": 3.770522042731288e-06, "loss": 0.88087809, "num_input_tokens_seen": 31514500, "step": 1492, "time_per_iteration": 2.554351329803467 }, { "auxiliary_loss_clip": 0.01158754, "auxiliary_loss_mlp": 0.01044088, "balance_loss_clip": 1.06052935, "balance_loss_mlp": 1.03391981, "epoch": 0.1795226357241628, "flos": 23178685795200.0, "grad_norm": 1.930896392562346, "language_loss": 0.87572682, "learning_rate": 3.7701596129930122e-06, "loss": 0.89775521, "num_input_tokens_seen": 31533225, "step": 1493, "time_per_iteration": 2.6225645542144775 }, { "auxiliary_loss_clip": 0.01188602, "auxiliary_loss_mlp": 0.01029268, "balance_loss_clip": 1.06381631, "balance_loss_mlp": 1.01934385, "epoch": 0.1796428786148019, "flos": 22090413484800.0, "grad_norm": 4.144401204839204, "language_loss": 0.73243254, "learning_rate": 3.7697969147229315e-06, "loss": 0.75461125, "num_input_tokens_seen": 31551385, "step": 1494, "time_per_iteration": 2.599379062652588 }, { "auxiliary_loss_clip": 0.01209953, "auxiliary_loss_mlp": 0.01043239, "balance_loss_clip": 1.06490481, "balance_loss_mlp": 1.03379166, "epoch": 0.179763121505441, "flos": 21324618501120.0, "grad_norm": 1.969454641914461, "language_loss": 0.85120654, "learning_rate": 3.7694339479760647e-06, "loss": 0.87373853, "num_input_tokens_seen": 31570415, "step": 1495, "time_per_iteration": 2.514608144760132 }, { "auxiliary_loss_clip": 0.0108858, "auxiliary_loss_mlp": 0.01003784, "balance_loss_clip": 1.02506685, "balance_loss_mlp": 1.00150728, "epoch": 0.17988336439608008, "flos": 68161864815360.0, "grad_norm": 0.7716434244681164, "language_loss": 0.57329476, "learning_rate": 3.769070712807476e-06, "loss": 0.59421837, "num_input_tokens_seen": 31632445, "step": 1496, "time_per_iteration": 3.210659980773926 }, { "auxiliary_loss_clip": 0.0113849, "auxiliary_loss_mlp": 0.01035832, "balance_loss_clip": 1.0561254, "balance_loss_mlp": 1.0256933, "epoch": 0.18000360728671919, "flos": 21945047143680.0, "grad_norm": 8.409180134690091, "language_loss": 0.7832309, "learning_rate": 3.768707209272266e-06, "loss": 0.80497408, "num_input_tokens_seen": 31652575, "step": 1497, "time_per_iteration": 2.6351358890533447 }, { "auxiliary_loss_clip": 0.01192077, "auxiliary_loss_mlp": 0.01043779, "balance_loss_clip": 1.06321609, "balance_loss_mlp": 1.03396845, "epoch": 0.18012385017735827, "flos": 18986330937600.0, "grad_norm": 2.257416628677278, "language_loss": 0.76798677, "learning_rate": 3.768343437425579e-06, "loss": 0.79034531, "num_input_tokens_seen": 31671145, "step": 1498, "time_per_iteration": 3.2797653675079346 }, { "auxiliary_loss_clip": 0.01122288, "auxiliary_loss_mlp": 0.01036653, "balance_loss_clip": 1.05213952, "balance_loss_mlp": 1.02694941, "epoch": 0.18024409306799735, "flos": 19748103598080.0, "grad_norm": 2.976526158329585, "language_loss": 0.86206388, "learning_rate": 3.7679793973225987e-06, "loss": 0.88365334, "num_input_tokens_seen": 31686955, "step": 1499, "time_per_iteration": 2.7085530757904053 }, { "auxiliary_loss_clip": 0.01059131, "auxiliary_loss_mlp": 0.01007084, "balance_loss_clip": 1.02096677, "balance_loss_mlp": 1.004879, "epoch": 0.18036433595863643, "flos": 67227183060480.0, "grad_norm": 0.8536866501646673, "language_loss": 0.61661541, "learning_rate": 3.767615089018549e-06, "loss": 0.6372776, "num_input_tokens_seen": 31749300, "step": 1500, "time_per_iteration": 3.2229766845703125 }, { "auxiliary_loss_clip": 0.01187858, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.06063616, "balance_loss_mlp": 1.02766371, "epoch": 0.18048457884927555, "flos": 18181464935040.0, "grad_norm": 2.0250824065102058, "language_loss": 0.86185533, "learning_rate": 3.7672505125686966e-06, "loss": 0.88411683, "num_input_tokens_seen": 31765665, "step": 1501, "time_per_iteration": 2.569427251815796 }, { "auxiliary_loss_clip": 0.01163784, "auxiliary_loss_mlp": 0.01041233, "balance_loss_clip": 1.05891323, "balance_loss_mlp": 1.03064132, "epoch": 0.18060482173991463, "flos": 15813767111040.0, "grad_norm": 3.9946685672686373, "language_loss": 0.84002924, "learning_rate": 3.7668856680283455e-06, "loss": 0.86207938, "num_input_tokens_seen": 31782690, "step": 1502, "time_per_iteration": 3.3340561389923096 }, { "auxiliary_loss_clip": 0.01200961, "auxiliary_loss_mlp": 0.01046974, "balance_loss_clip": 1.06473649, "balance_loss_mlp": 1.03644788, "epoch": 0.1807250646305537, "flos": 18587399512320.0, "grad_norm": 1.9180714958242873, "language_loss": 0.82524037, "learning_rate": 3.7665205554528437e-06, "loss": 0.84771967, "num_input_tokens_seen": 31802045, "step": 1503, "time_per_iteration": 3.227564573287964 }, { "auxiliary_loss_clip": 0.01195427, "auxiliary_loss_mlp": 0.01034344, "balance_loss_clip": 1.06666553, "balance_loss_mlp": 1.02470553, "epoch": 0.18084530752119282, "flos": 23149131880320.0, "grad_norm": 1.6546176147307523, "language_loss": 0.74496472, "learning_rate": 3.7661551748975782e-06, "loss": 0.76726246, "num_input_tokens_seen": 31820220, "step": 1504, "time_per_iteration": 3.391793966293335 }, { "auxiliary_loss_clip": 0.01088856, "auxiliary_loss_mlp": 0.01003506, "balance_loss_clip": 1.02463984, "balance_loss_mlp": 1.00107408, "epoch": 0.1809655504118319, "flos": 59803153568640.0, "grad_norm": 0.8156460389296397, "language_loss": 0.60500389, "learning_rate": 3.7657895264179772e-06, "loss": 0.62592751, "num_input_tokens_seen": 31876195, "step": 1505, "time_per_iteration": 3.1301932334899902 }, { "auxiliary_loss_clip": 0.01183661, "auxiliary_loss_mlp": 0.01033698, "balance_loss_clip": 1.05916739, "balance_loss_mlp": 1.02395225, "epoch": 0.181085793302471, "flos": 44201941188480.0, "grad_norm": 1.8596296540972728, "language_loss": 0.74606764, "learning_rate": 3.765423610069509e-06, "loss": 0.76824117, "num_input_tokens_seen": 31901585, "step": 1506, "time_per_iteration": 2.74784255027771 }, { "auxiliary_loss_clip": 0.01195788, "auxiliary_loss_mlp": 0.01038822, "balance_loss_clip": 1.06475949, "balance_loss_mlp": 1.02884972, "epoch": 0.18120603619311007, "flos": 34898384638080.0, "grad_norm": 1.8851615574828986, "language_loss": 0.72109091, "learning_rate": 3.765057425907683e-06, "loss": 0.74343699, "num_input_tokens_seen": 31923045, "step": 1507, "time_per_iteration": 2.6937994956970215 }, { "auxiliary_loss_clip": 0.01211167, "auxiliary_loss_mlp": 0.01036532, "balance_loss_clip": 1.06312633, "balance_loss_mlp": 1.02674508, "epoch": 0.18132627908374918, "flos": 21506757390720.0, "grad_norm": 1.768929514658489, "language_loss": 0.78538179, "learning_rate": 3.764690973988048e-06, "loss": 0.80785882, "num_input_tokens_seen": 31943385, "step": 1508, "time_per_iteration": 2.5595650672912598 }, { "auxiliary_loss_clip": 0.01182481, "auxiliary_loss_mlp": 0.01042709, "balance_loss_clip": 1.06036294, "balance_loss_mlp": 1.0328207, "epoch": 0.18144652197438826, "flos": 29057693633280.0, "grad_norm": 1.8455794211593006, "language_loss": 0.74236906, "learning_rate": 3.7643242543661967e-06, "loss": 0.76462096, "num_input_tokens_seen": 31966045, "step": 1509, "time_per_iteration": 2.6198956966400146 }, { "auxiliary_loss_clip": 0.01077482, "auxiliary_loss_mlp": 0.01004015, "balance_loss_clip": 1.01978374, "balance_loss_mlp": 1.00165451, "epoch": 0.18156676486502735, "flos": 68675064382080.0, "grad_norm": 0.8141100265282946, "language_loss": 0.6047442, "learning_rate": 3.7639572670977573e-06, "loss": 0.62555915, "num_input_tokens_seen": 32021540, "step": 1510, "time_per_iteration": 3.034628391265869 }, { "auxiliary_loss_clip": 0.0118307, "auxiliary_loss_mlp": 0.01032463, "balance_loss_clip": 1.06088781, "balance_loss_mlp": 1.02371883, "epoch": 0.18168700775566646, "flos": 26471515334400.0, "grad_norm": 1.689231618986345, "language_loss": 0.76439929, "learning_rate": 3.7635900122384042e-06, "loss": 0.78655457, "num_input_tokens_seen": 32044535, "step": 1511, "time_per_iteration": 2.633913040161133 }, { "auxiliary_loss_clip": 0.0119277, "auxiliary_loss_mlp": 0.01039108, "balance_loss_clip": 1.06035399, "balance_loss_mlp": 1.02766371, "epoch": 0.18180725064630554, "flos": 15005668884480.0, "grad_norm": 2.11101222314304, "language_loss": 0.86849564, "learning_rate": 3.7632224898438477e-06, "loss": 0.89081442, "num_input_tokens_seen": 32061010, "step": 1512, "time_per_iteration": 2.5246124267578125 }, { "auxiliary_loss_clip": 0.01183613, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.06146169, "balance_loss_mlp": 1.02763009, "epoch": 0.18192749353694462, "flos": 19682387665920.0, "grad_norm": 1.6838685803242521, "language_loss": 0.79145873, "learning_rate": 3.762854699969842e-06, "loss": 0.81367123, "num_input_tokens_seen": 32081520, "step": 1513, "time_per_iteration": 2.6028246879577637 }, { "auxiliary_loss_clip": 0.01204483, "auxiliary_loss_mlp": 0.01034155, "balance_loss_clip": 1.06427622, "balance_loss_mlp": 1.02461851, "epoch": 0.1820477364275837, "flos": 20702717400960.0, "grad_norm": 1.9062881956836504, "language_loss": 0.7312277, "learning_rate": 3.762486642672179e-06, "loss": 0.75361407, "num_input_tokens_seen": 32098460, "step": 1514, "time_per_iteration": 2.5115625858306885 }, { "auxiliary_loss_clip": 0.01197231, "auxiliary_loss_mlp": 0.01040359, "balance_loss_clip": 1.06478834, "balance_loss_mlp": 1.03078651, "epoch": 0.18216797931822282, "flos": 17128708197120.0, "grad_norm": 2.102554337710655, "language_loss": 0.86775666, "learning_rate": 3.7621183180066946e-06, "loss": 0.89013255, "num_input_tokens_seen": 32116420, "step": 1515, "time_per_iteration": 2.523895025253296 }, { "auxiliary_loss_clip": 0.01189582, "auxiliary_loss_mlp": 0.01038367, "balance_loss_clip": 1.05765152, "balance_loss_mlp": 1.02834129, "epoch": 0.1822882222088619, "flos": 29242561956480.0, "grad_norm": 1.968434127860691, "language_loss": 0.73876262, "learning_rate": 3.7617497260292625e-06, "loss": 0.76104206, "num_input_tokens_seen": 32138475, "step": 1516, "time_per_iteration": 2.6273505687713623 }, { "auxiliary_loss_clip": 0.0118839, "auxiliary_loss_mlp": 0.01037132, "balance_loss_clip": 1.06220722, "balance_loss_mlp": 1.02676654, "epoch": 0.18240846509950098, "flos": 17702739446400.0, "grad_norm": 3.349364183620209, "language_loss": 0.78685212, "learning_rate": 3.7613808667957967e-06, "loss": 0.8091073, "num_input_tokens_seen": 32151165, "step": 1517, "time_per_iteration": 2.5250847339630127 }, { "auxiliary_loss_clip": 0.01193989, "auxiliary_loss_mlp": 0.01043487, "balance_loss_clip": 1.0603497, "balance_loss_mlp": 1.03366995, "epoch": 0.1825287079901401, "flos": 14790025584000.0, "grad_norm": 1.974432659882828, "language_loss": 0.90683413, "learning_rate": 3.7610117403622547e-06, "loss": 0.92920893, "num_input_tokens_seen": 32167725, "step": 1518, "time_per_iteration": 2.523420572280884 }, { "auxiliary_loss_clip": 0.01174113, "auxiliary_loss_mlp": 0.01040583, "balance_loss_clip": 1.05876625, "balance_loss_mlp": 1.0306468, "epoch": 0.18264895088077918, "flos": 21946232292480.0, "grad_norm": 1.6944604763319746, "language_loss": 0.90227997, "learning_rate": 3.7606423467846313e-06, "loss": 0.92442691, "num_input_tokens_seen": 32187330, "step": 1519, "time_per_iteration": 2.6038784980773926 }, { "auxiliary_loss_clip": 0.01188438, "auxiliary_loss_mlp": 0.01050841, "balance_loss_clip": 1.06547582, "balance_loss_mlp": 1.04107177, "epoch": 0.18276919377141826, "flos": 20886759711360.0, "grad_norm": 1.4754682258010787, "language_loss": 0.79688585, "learning_rate": 3.760272686118964e-06, "loss": 0.8192786, "num_input_tokens_seen": 32205550, "step": 1520, "time_per_iteration": 2.5905654430389404 }, { "auxiliary_loss_clip": 0.01199473, "auxiliary_loss_mlp": 0.01033783, "balance_loss_clip": 1.06486034, "balance_loss_mlp": 1.02400219, "epoch": 0.18288943666205737, "flos": 21469877101440.0, "grad_norm": 2.119909026274675, "language_loss": 0.92671323, "learning_rate": 3.7599027584213297e-06, "loss": 0.94904578, "num_input_tokens_seen": 32224430, "step": 1521, "time_per_iteration": 2.569788932800293 }, { "auxiliary_loss_clip": 0.01211728, "auxiliary_loss_mlp": 0.01036578, "balance_loss_clip": 1.06329942, "balance_loss_mlp": 1.02660036, "epoch": 0.18300967955269645, "flos": 21539363961600.0, "grad_norm": 2.1363640443559104, "language_loss": 0.78239214, "learning_rate": 3.7595325637478465e-06, "loss": 0.80487525, "num_input_tokens_seen": 32242455, "step": 1522, "time_per_iteration": 2.509934425354004 }, { "auxiliary_loss_clip": 0.01183247, "auxiliary_loss_mlp": 0.01041711, "balance_loss_clip": 1.06197846, "balance_loss_mlp": 1.03139985, "epoch": 0.18312992244333554, "flos": 28876237102080.0, "grad_norm": 1.897685973617354, "language_loss": 0.81743228, "learning_rate": 3.7591621021546723e-06, "loss": 0.83968186, "num_input_tokens_seen": 32264450, "step": 1523, "time_per_iteration": 2.5924739837646484 }, { "auxiliary_loss_clip": 0.01203565, "auxiliary_loss_mlp": 0.01038506, "balance_loss_clip": 1.06330025, "balance_loss_mlp": 1.02834344, "epoch": 0.18325016533397462, "flos": 20120102801280.0, "grad_norm": 1.8704142237463572, "language_loss": 0.81470537, "learning_rate": 3.7587913736980062e-06, "loss": 0.83712602, "num_input_tokens_seen": 32284090, "step": 1524, "time_per_iteration": 3.221287727355957 }, { "auxiliary_loss_clip": 0.01129305, "auxiliary_loss_mlp": 0.01047868, "balance_loss_clip": 1.05044973, "balance_loss_mlp": 1.03841496, "epoch": 0.18337040822461373, "flos": 23329187781120.0, "grad_norm": 1.6767655551456115, "language_loss": 0.84322566, "learning_rate": 3.7584203784340865e-06, "loss": 0.86499739, "num_input_tokens_seen": 32303260, "step": 1525, "time_per_iteration": 2.6494548320770264 }, { "auxiliary_loss_clip": 0.01187608, "auxiliary_loss_mlp": 0.01035737, "balance_loss_clip": 1.0586679, "balance_loss_mlp": 1.02618194, "epoch": 0.1834906511152528, "flos": 25009555881600.0, "grad_norm": 2.5006777771099045, "language_loss": 0.85549092, "learning_rate": 3.7580491164191938e-06, "loss": 0.87772441, "num_input_tokens_seen": 32321570, "step": 1526, "time_per_iteration": 2.5863144397735596 }, { "auxiliary_loss_clip": 0.01099542, "auxiliary_loss_mlp": 0.01005181, "balance_loss_clip": 1.023561, "balance_loss_mlp": 1.00288057, "epoch": 0.1836108940058919, "flos": 67251493589760.0, "grad_norm": 0.7490254039796733, "language_loss": 0.61291361, "learning_rate": 3.757677587709648e-06, "loss": 0.63396084, "num_input_tokens_seen": 32384835, "step": 1527, "time_per_iteration": 3.9287827014923096 }, { "auxiliary_loss_clip": 0.01175726, "auxiliary_loss_mlp": 0.01035236, "balance_loss_clip": 1.06310415, "balance_loss_mlp": 1.02561557, "epoch": 0.183731136896531, "flos": 25738721971200.0, "grad_norm": 1.901934923493248, "language_loss": 0.75813723, "learning_rate": 3.7573057923618095e-06, "loss": 0.78024685, "num_input_tokens_seen": 32404930, "step": 1528, "time_per_iteration": 2.614283323287964 }, { "auxiliary_loss_clip": 0.01159313, "auxiliary_loss_mlp": 0.01037457, "balance_loss_clip": 1.05477118, "balance_loss_mlp": 1.02697217, "epoch": 0.1838513797871701, "flos": 20449403712000.0, "grad_norm": 3.0487601404800104, "language_loss": 0.7423656, "learning_rate": 3.7569337304320793e-06, "loss": 0.76433331, "num_input_tokens_seen": 32424515, "step": 1529, "time_per_iteration": 3.3069894313812256 }, { "auxiliary_loss_clip": 0.0108509, "auxiliary_loss_mlp": 0.01002326, "balance_loss_clip": 1.0211314, "balance_loss_mlp": 0.99998969, "epoch": 0.18397162267780917, "flos": 68565141786240.0, "grad_norm": 0.8470666919323194, "language_loss": 0.64457798, "learning_rate": 3.756561401976899e-06, "loss": 0.66545212, "num_input_tokens_seen": 32484220, "step": 1530, "time_per_iteration": 3.020946979522705 }, { "auxiliary_loss_clip": 0.01225184, "auxiliary_loss_mlp": 0.01037763, "balance_loss_clip": 1.06730294, "balance_loss_mlp": 1.02790415, "epoch": 0.18409186556844825, "flos": 31941104976000.0, "grad_norm": 1.7752049267531378, "language_loss": 0.82390738, "learning_rate": 3.7561888070527514e-06, "loss": 0.84653687, "num_input_tokens_seen": 32506260, "step": 1531, "time_per_iteration": 3.3986682891845703 }, { "auxiliary_loss_clip": 0.01161928, "auxiliary_loss_mlp": 0.00763049, "balance_loss_clip": 1.05631495, "balance_loss_mlp": 0.99995065, "epoch": 0.18421210845908736, "flos": 20120533764480.0, "grad_norm": 2.2375225032963004, "language_loss": 0.79808092, "learning_rate": 3.7558159457161577e-06, "loss": 0.81733072, "num_input_tokens_seen": 32524225, "step": 1532, "time_per_iteration": 2.5843939781188965 }, { "auxiliary_loss_clip": 0.01191487, "auxiliary_loss_mlp": 0.00762893, "balance_loss_clip": 1.06011915, "balance_loss_mlp": 0.9999702, "epoch": 0.18433235134972645, "flos": 23110491824640.0, "grad_norm": 2.3630891632324933, "language_loss": 0.77532244, "learning_rate": 3.755442818023681e-06, "loss": 0.7948662, "num_input_tokens_seen": 32543850, "step": 1533, "time_per_iteration": 2.5639564990997314 }, { "auxiliary_loss_clip": 0.01178188, "auxiliary_loss_mlp": 0.0103062, "balance_loss_clip": 1.06029916, "balance_loss_mlp": 1.02135181, "epoch": 0.18445259424036553, "flos": 18291351617280.0, "grad_norm": 1.9760335825044806, "language_loss": 0.76010692, "learning_rate": 3.7550694240319246e-06, "loss": 0.78219497, "num_input_tokens_seen": 32561725, "step": 1534, "time_per_iteration": 2.5595877170562744 }, { "auxiliary_loss_clip": 0.01212728, "auxiliary_loss_mlp": 0.01041286, "balance_loss_clip": 1.06418991, "balance_loss_mlp": 1.03177357, "epoch": 0.18457283713100464, "flos": 21324079797120.0, "grad_norm": 2.4658305033267327, "language_loss": 0.77134097, "learning_rate": 3.7546957637975326e-06, "loss": 0.79388118, "num_input_tokens_seen": 32579135, "step": 1535, "time_per_iteration": 2.5183582305908203 }, { "auxiliary_loss_clip": 0.01137556, "auxiliary_loss_mlp": 0.01035613, "balance_loss_clip": 1.04933047, "balance_loss_mlp": 1.02677941, "epoch": 0.18469308002164372, "flos": 20375679047040.0, "grad_norm": 1.5505820138035824, "language_loss": 0.74185795, "learning_rate": 3.7543218373771873e-06, "loss": 0.76358962, "num_input_tokens_seen": 32598460, "step": 1536, "time_per_iteration": 2.6275808811187744 }, { "auxiliary_loss_clip": 0.0113907, "auxiliary_loss_mlp": 0.00762731, "balance_loss_clip": 1.05277157, "balance_loss_mlp": 0.99995345, "epoch": 0.1848133229122828, "flos": 26435892021120.0, "grad_norm": 1.413192405025168, "language_loss": 0.77757478, "learning_rate": 3.753947644827615e-06, "loss": 0.79659283, "num_input_tokens_seen": 32621920, "step": 1537, "time_per_iteration": 2.773341178894043 }, { "auxiliary_loss_clip": 0.01084548, "auxiliary_loss_mlp": 0.01012139, "balance_loss_clip": 1.01953483, "balance_loss_mlp": 1.00973129, "epoch": 0.1849335658029219, "flos": 70547447612160.0, "grad_norm": 0.9394750042745883, "language_loss": 0.57167655, "learning_rate": 3.753573186205579e-06, "loss": 0.59264344, "num_input_tokens_seen": 32690040, "step": 1538, "time_per_iteration": 3.288184642791748 }, { "auxiliary_loss_clip": 0.01177926, "auxiliary_loss_mlp": 0.00762962, "balance_loss_clip": 1.05674195, "balance_loss_mlp": 0.99995553, "epoch": 0.185053808693561, "flos": 17384140788480.0, "grad_norm": 2.137564284253662, "language_loss": 0.7758376, "learning_rate": 3.753198461567885e-06, "loss": 0.79524648, "num_input_tokens_seen": 32707285, "step": 1539, "time_per_iteration": 2.536231517791748 }, { "auxiliary_loss_clip": 0.01170512, "auxiliary_loss_mlp": 0.01035419, "balance_loss_clip": 1.06242597, "balance_loss_mlp": 1.0265801, "epoch": 0.18517405158420008, "flos": 28986159697920.0, "grad_norm": 1.8125412293795107, "language_loss": 0.91884267, "learning_rate": 3.7528234709713783e-06, "loss": 0.94090199, "num_input_tokens_seen": 32730030, "step": 1540, "time_per_iteration": 2.633136749267578 }, { "auxiliary_loss_clip": 0.01211803, "auxiliary_loss_mlp": 0.01049627, "balance_loss_clip": 1.06628299, "balance_loss_mlp": 1.04032278, "epoch": 0.18529429447483917, "flos": 26794962328320.0, "grad_norm": 1.9768859262158636, "language_loss": 0.84305435, "learning_rate": 3.7524482144729447e-06, "loss": 0.86566865, "num_input_tokens_seen": 32749485, "step": 1541, "time_per_iteration": 2.5639114379882812 }, { "auxiliary_loss_clip": 0.01169858, "auxiliary_loss_mlp": 0.01037198, "balance_loss_clip": 1.05620933, "balance_loss_mlp": 1.02676129, "epoch": 0.18541453736547828, "flos": 13581595301760.0, "grad_norm": 2.162070760397459, "language_loss": 0.83690727, "learning_rate": 3.7520726921295106e-06, "loss": 0.85897779, "num_input_tokens_seen": 32766205, "step": 1542, "time_per_iteration": 2.5539703369140625 }, { "auxiliary_loss_clip": 0.01200362, "auxiliary_loss_mlp": 0.01045691, "balance_loss_clip": 1.05969083, "balance_loss_mlp": 1.0360235, "epoch": 0.18553478025611736, "flos": 24025424077440.0, "grad_norm": 2.0135258610314253, "language_loss": 0.7261343, "learning_rate": 3.751696903998042e-06, "loss": 0.74859488, "num_input_tokens_seen": 32784840, "step": 1543, "time_per_iteration": 2.545659065246582 }, { "auxiliary_loss_clip": 0.01205326, "auxiliary_loss_mlp": 0.01037053, "balance_loss_clip": 1.06652558, "balance_loss_mlp": 1.02792168, "epoch": 0.18565502314675644, "flos": 25885165720320.0, "grad_norm": 3.4351690661654124, "language_loss": 0.69872129, "learning_rate": 3.7513208501355456e-06, "loss": 0.72114515, "num_input_tokens_seen": 32805945, "step": 1544, "time_per_iteration": 2.581122398376465 }, { "auxiliary_loss_clip": 0.01191431, "auxiliary_loss_mlp": 0.01034131, "balance_loss_clip": 1.06305015, "balance_loss_mlp": 1.0249635, "epoch": 0.18577526603739553, "flos": 19610063631360.0, "grad_norm": 2.532714989420842, "language_loss": 0.83470249, "learning_rate": 3.750944530599069e-06, "loss": 0.85695809, "num_input_tokens_seen": 32825515, "step": 1545, "time_per_iteration": 2.592742443084717 }, { "auxiliary_loss_clip": 0.01214324, "auxiliary_loss_mlp": 0.01032609, "balance_loss_clip": 1.06543922, "balance_loss_mlp": 1.02312613, "epoch": 0.18589550892803464, "flos": 18474891137280.0, "grad_norm": 2.1361612923652844, "language_loss": 0.80833191, "learning_rate": 3.7505679454456992e-06, "loss": 0.83080125, "num_input_tokens_seen": 32842125, "step": 1546, "time_per_iteration": 2.510756015777588 }, { "auxiliary_loss_clip": 0.0112536, "auxiliary_loss_mlp": 0.01037643, "balance_loss_clip": 1.05226469, "balance_loss_mlp": 1.02854133, "epoch": 0.18601575181867372, "flos": 23549966726400.0, "grad_norm": 2.0448043156578812, "language_loss": 0.70070255, "learning_rate": 3.750191094732564e-06, "loss": 0.72233248, "num_input_tokens_seen": 32862990, "step": 1547, "time_per_iteration": 2.7696404457092285 }, { "auxiliary_loss_clip": 0.01122448, "auxiliary_loss_mlp": 0.00763267, "balance_loss_clip": 1.04884839, "balance_loss_mlp": 0.9999705, "epoch": 0.1861359947093128, "flos": 26360192108160.0, "grad_norm": 1.7003317984376691, "language_loss": 0.75442553, "learning_rate": 3.7498139785168313e-06, "loss": 0.77328265, "num_input_tokens_seen": 32883595, "step": 1548, "time_per_iteration": 2.8635051250457764 }, { "auxiliary_loss_clip": 0.01205898, "auxiliary_loss_mlp": 0.01036649, "balance_loss_clip": 1.06422997, "balance_loss_mlp": 1.02762485, "epoch": 0.1862562375999519, "flos": 23331198942720.0, "grad_norm": 1.7207088838599416, "language_loss": 0.77259612, "learning_rate": 3.749436596855709e-06, "loss": 0.79502153, "num_input_tokens_seen": 32902895, "step": 1549, "time_per_iteration": 2.5355703830718994 }, { "auxiliary_loss_clip": 0.0119779, "auxiliary_loss_mlp": 0.0104666, "balance_loss_clip": 1.06027389, "balance_loss_mlp": 1.0356096, "epoch": 0.186376480490591, "flos": 16648222942080.0, "grad_norm": 1.9263211145734314, "language_loss": 0.90676194, "learning_rate": 3.749058949806446e-06, "loss": 0.92920649, "num_input_tokens_seen": 32919620, "step": 1550, "time_per_iteration": 3.3030893802642822 }, { "auxiliary_loss_clip": 0.01209049, "auxiliary_loss_mlp": 0.01042724, "balance_loss_clip": 1.06452656, "balance_loss_mlp": 1.0334909, "epoch": 0.18649672338123008, "flos": 21468656039040.0, "grad_norm": 1.6321679991220814, "language_loss": 0.84358507, "learning_rate": 3.748681037426331e-06, "loss": 0.86610281, "num_input_tokens_seen": 32938830, "step": 1551, "time_per_iteration": 2.5320985317230225 }, { "auxiliary_loss_clip": 0.01223667, "auxiliary_loss_mlp": 0.01043876, "balance_loss_clip": 1.0667634, "balance_loss_mlp": 1.03407145, "epoch": 0.1866169662718692, "flos": 12312728386560.0, "grad_norm": 2.1290873349337214, "language_loss": 0.91829687, "learning_rate": 3.7483028597726936e-06, "loss": 0.94097227, "num_input_tokens_seen": 32955600, "step": 1552, "time_per_iteration": 2.4589030742645264 }, { "auxiliary_loss_clip": 0.01176953, "auxiliary_loss_mlp": 0.01050509, "balance_loss_clip": 1.06197023, "balance_loss_mlp": 1.03953612, "epoch": 0.18673720916250827, "flos": 23581280407680.0, "grad_norm": 1.7638450267452004, "language_loss": 0.62522751, "learning_rate": 3.7479244169029017e-06, "loss": 0.64750218, "num_input_tokens_seen": 32975390, "step": 1553, "time_per_iteration": 3.5767829418182373 }, { "auxiliary_loss_clip": 0.01206272, "auxiliary_loss_mlp": 0.01031111, "balance_loss_clip": 1.05982089, "balance_loss_mlp": 1.02143741, "epoch": 0.18685745205314735, "flos": 19718370115200.0, "grad_norm": 2.576689265557877, "language_loss": 0.73838198, "learning_rate": 3.7475457088743658e-06, "loss": 0.76075578, "num_input_tokens_seen": 32992640, "step": 1554, "time_per_iteration": 2.511605739593506 }, { "auxiliary_loss_clip": 0.01181872, "auxiliary_loss_mlp": 0.0103614, "balance_loss_clip": 1.05984354, "balance_loss_mlp": 1.02638245, "epoch": 0.18697769494378644, "flos": 34204123589760.0, "grad_norm": 1.9737628870263553, "language_loss": 0.74543142, "learning_rate": 3.7471667357445348e-06, "loss": 0.7676115, "num_input_tokens_seen": 33012470, "step": 1555, "time_per_iteration": 3.3819191455841064 }, { "auxiliary_loss_clip": 0.01144858, "auxiliary_loss_mlp": 0.01032203, "balance_loss_clip": 1.05571735, "balance_loss_mlp": 1.02294624, "epoch": 0.18709793783442555, "flos": 34241327101440.0, "grad_norm": 2.105336051580151, "language_loss": 0.72356141, "learning_rate": 3.7467874975709e-06, "loss": 0.745332, "num_input_tokens_seen": 33033275, "step": 1556, "time_per_iteration": 3.59055757522583 }, { "auxiliary_loss_clip": 0.01213722, "auxiliary_loss_mlp": 0.01038859, "balance_loss_clip": 1.06765676, "balance_loss_mlp": 1.02947164, "epoch": 0.18721818072506463, "flos": 40734550529280.0, "grad_norm": 2.316404847050491, "language_loss": 0.78320998, "learning_rate": 3.7464079944109904e-06, "loss": 0.80573583, "num_input_tokens_seen": 33055135, "step": 1557, "time_per_iteration": 2.6955080032348633 }, { "auxiliary_loss_clip": 0.01174933, "auxiliary_loss_mlp": 0.01036035, "balance_loss_clip": 1.05731547, "balance_loss_mlp": 1.02617645, "epoch": 0.18733842361570371, "flos": 22157386392960.0, "grad_norm": 1.817672367832006, "language_loss": 0.77580929, "learning_rate": 3.746028226322376e-06, "loss": 0.79791898, "num_input_tokens_seen": 33071015, "step": 1558, "time_per_iteration": 2.5665252208709717 }, { "auxiliary_loss_clip": 0.01187956, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.06078529, "balance_loss_mlp": 1.02747834, "epoch": 0.18745866650634282, "flos": 18914940656640.0, "grad_norm": 1.9069420471941214, "language_loss": 0.75631493, "learning_rate": 3.745648193362669e-06, "loss": 0.77855206, "num_input_tokens_seen": 33090370, "step": 1559, "time_per_iteration": 2.5902605056762695 }, { "auxiliary_loss_clip": 0.01192695, "auxiliary_loss_mlp": 0.01038514, "balance_loss_clip": 1.06400871, "balance_loss_mlp": 1.0293709, "epoch": 0.1875789093969819, "flos": 19314626267520.0, "grad_norm": 2.1280167972778536, "language_loss": 0.71972919, "learning_rate": 3.745267895589518e-06, "loss": 0.74204123, "num_input_tokens_seen": 33108910, "step": 1560, "time_per_iteration": 2.5557291507720947 }, { "auxiliary_loss_clip": 0.01189336, "auxiliary_loss_mlp": 0.01036951, "balance_loss_clip": 1.06040108, "balance_loss_mlp": 1.02689564, "epoch": 0.187699152287621, "flos": 17018965169280.0, "grad_norm": 2.833461108890595, "language_loss": 0.82187605, "learning_rate": 3.7448873330606154e-06, "loss": 0.84413892, "num_input_tokens_seen": 33126680, "step": 1561, "time_per_iteration": 2.5465331077575684 }, { "auxiliary_loss_clip": 0.01169454, "auxiliary_loss_mlp": 0.01043611, "balance_loss_clip": 1.06006181, "balance_loss_mlp": 1.03392577, "epoch": 0.18781939517826007, "flos": 22346384780160.0, "grad_norm": 2.043775681049307, "language_loss": 0.87289125, "learning_rate": 3.7445065058336914e-06, "loss": 0.89502192, "num_input_tokens_seen": 33145550, "step": 1562, "time_per_iteration": 2.573452949523926 }, { "auxiliary_loss_clip": 0.0114474, "auxiliary_loss_mlp": 0.01035922, "balance_loss_clip": 1.05005646, "balance_loss_mlp": 1.02606368, "epoch": 0.18793963806889918, "flos": 14611478054400.0, "grad_norm": 1.9512708451196548, "language_loss": 0.86468744, "learning_rate": 3.7441254139665176e-06, "loss": 0.8864941, "num_input_tokens_seen": 33161735, "step": 1563, "time_per_iteration": 2.582707405090332 }, { "auxiliary_loss_clip": 0.01222995, "auxiliary_loss_mlp": 0.01041988, "balance_loss_clip": 1.06849694, "balance_loss_mlp": 1.03278482, "epoch": 0.18805988095953827, "flos": 17457075354240.0, "grad_norm": 1.6870894677479416, "language_loss": 0.82578194, "learning_rate": 3.743744057516905e-06, "loss": 0.84843183, "num_input_tokens_seen": 33179795, "step": 1564, "time_per_iteration": 2.4786689281463623 }, { "auxiliary_loss_clip": 0.01159889, "auxiliary_loss_mlp": 0.01050165, "balance_loss_clip": 1.05807924, "balance_loss_mlp": 1.03954375, "epoch": 0.18818012385017735, "flos": 15043877976960.0, "grad_norm": 2.915923157980633, "language_loss": 0.88036609, "learning_rate": 3.743362436542706e-06, "loss": 0.90246665, "num_input_tokens_seen": 33194485, "step": 1565, "time_per_iteration": 2.594353675842285 }, { "auxiliary_loss_clip": 0.01220944, "auxiliary_loss_mlp": 0.01045611, "balance_loss_clip": 1.06598806, "balance_loss_mlp": 1.03630042, "epoch": 0.18830036674081646, "flos": 47551975136640.0, "grad_norm": 1.7679283692267709, "language_loss": 0.76665384, "learning_rate": 3.7429805511018115e-06, "loss": 0.7893194, "num_input_tokens_seen": 33216145, "step": 1566, "time_per_iteration": 2.751323699951172 }, { "auxiliary_loss_clip": 0.01174665, "auxiliary_loss_mlp": 0.00763703, "balance_loss_clip": 1.06283772, "balance_loss_mlp": 0.99996382, "epoch": 0.18842060963145554, "flos": 30044626698240.0, "grad_norm": 1.6098371619079042, "language_loss": 0.7797001, "learning_rate": 3.7425984012521524e-06, "loss": 0.79908371, "num_input_tokens_seen": 33236345, "step": 1567, "time_per_iteration": 2.658365249633789 }, { "auxiliary_loss_clip": 0.01071946, "auxiliary_loss_mlp": 0.00753401, "balance_loss_clip": 1.02488101, "balance_loss_mlp": 1.00011468, "epoch": 0.18854085252209463, "flos": 70318372625280.0, "grad_norm": 0.7397093051307336, "language_loss": 0.60428119, "learning_rate": 3.7422159870517025e-06, "loss": 0.62253463, "num_input_tokens_seen": 33301600, "step": 1568, "time_per_iteration": 3.187368631362915 }, { "auxiliary_loss_clip": 0.01191388, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.06223154, "balance_loss_mlp": 1.03099895, "epoch": 0.1886610954127337, "flos": 21289318410240.0, "grad_norm": 1.6218824778168317, "language_loss": 0.79163098, "learning_rate": 3.7418333085584717e-06, "loss": 0.81394553, "num_input_tokens_seen": 33322785, "step": 1569, "time_per_iteration": 2.6207797527313232 }, { "auxiliary_loss_clip": 0.01180528, "auxiliary_loss_mlp": 0.01043096, "balance_loss_clip": 1.06318498, "balance_loss_mlp": 1.03335059, "epoch": 0.18878133830337282, "flos": 17266819991040.0, "grad_norm": 2.895595849773554, "language_loss": 0.90939426, "learning_rate": 3.7414503658305128e-06, "loss": 0.93163049, "num_input_tokens_seen": 33340020, "step": 1570, "time_per_iteration": 2.5801126956939697 }, { "auxiliary_loss_clip": 0.01168922, "auxiliary_loss_mlp": 0.01043105, "balance_loss_clip": 1.05825305, "balance_loss_mlp": 1.03319883, "epoch": 0.1889015811940119, "flos": 25775207210880.0, "grad_norm": 2.0154541266340518, "language_loss": 0.77537584, "learning_rate": 3.7410671589259185e-06, "loss": 0.79749608, "num_input_tokens_seen": 33358620, "step": 1571, "time_per_iteration": 2.660343647003174 }, { "auxiliary_loss_clip": 0.012247, "auxiliary_loss_mlp": 0.01040396, "balance_loss_clip": 1.06999385, "balance_loss_mlp": 1.03039408, "epoch": 0.18902182408465099, "flos": 21032197879680.0, "grad_norm": 1.9460538247130146, "language_loss": 0.79485863, "learning_rate": 3.7406836879028205e-06, "loss": 0.81750959, "num_input_tokens_seen": 33378845, "step": 1572, "time_per_iteration": 2.5058295726776123 }, { "auxiliary_loss_clip": 0.01205059, "auxiliary_loss_mlp": 0.01047407, "balance_loss_clip": 1.06544745, "balance_loss_mlp": 1.03811479, "epoch": 0.1891420669752901, "flos": 22272121411200.0, "grad_norm": 1.8514262518484188, "language_loss": 0.76701748, "learning_rate": 3.7402999528193907e-06, "loss": 0.7895422, "num_input_tokens_seen": 33398345, "step": 1573, "time_per_iteration": 2.5200045108795166 }, { "auxiliary_loss_clip": 0.01164583, "auxiliary_loss_mlp": 0.0076385, "balance_loss_clip": 1.06117094, "balance_loss_mlp": 0.99997497, "epoch": 0.18926230986592918, "flos": 22017802141440.0, "grad_norm": 2.225848764168749, "language_loss": 0.85295284, "learning_rate": 3.739915953733842e-06, "loss": 0.87223715, "num_input_tokens_seen": 33416390, "step": 1574, "time_per_iteration": 2.595884323120117 }, { "auxiliary_loss_clip": 0.0122293, "auxiliary_loss_mlp": 0.01034792, "balance_loss_clip": 1.06788445, "balance_loss_mlp": 1.0255233, "epoch": 0.18938255275656826, "flos": 24462672336000.0, "grad_norm": 1.5271483554484147, "language_loss": 0.81873727, "learning_rate": 3.7395316907044264e-06, "loss": 0.84131444, "num_input_tokens_seen": 33437175, "step": 1575, "time_per_iteration": 2.541637897491455 }, { "auxiliary_loss_clip": 0.01209155, "auxiliary_loss_mlp": 0.01048727, "balance_loss_clip": 1.06492496, "balance_loss_mlp": 1.03816485, "epoch": 0.18950279564720737, "flos": 24427049022720.0, "grad_norm": 1.6606104031430846, "language_loss": 0.79969609, "learning_rate": 3.7391471637894364e-06, "loss": 0.82227492, "num_input_tokens_seen": 33459440, "step": 1576, "time_per_iteration": 3.332841157913208 }, { "auxiliary_loss_clip": 0.01176275, "auxiliary_loss_mlp": 0.01037652, "balance_loss_clip": 1.05751526, "balance_loss_mlp": 1.02794242, "epoch": 0.18962303853784646, "flos": 19756291898880.0, "grad_norm": 1.7207416401348117, "language_loss": 0.85116196, "learning_rate": 3.738762373047205e-06, "loss": 0.87330127, "num_input_tokens_seen": 33479360, "step": 1577, "time_per_iteration": 2.6020920276641846 }, { "auxiliary_loss_clip": 0.01181895, "auxiliary_loss_mlp": 0.01039342, "balance_loss_clip": 1.06253266, "balance_loss_mlp": 1.03021073, "epoch": 0.18974328142848554, "flos": 21032054225280.0, "grad_norm": 1.5993358643671012, "language_loss": 0.83223462, "learning_rate": 3.738377318536103e-06, "loss": 0.85444707, "num_input_tokens_seen": 33499245, "step": 1578, "time_per_iteration": 3.3471055030822754 }, { "auxiliary_loss_clip": 0.01218784, "auxiliary_loss_mlp": 0.01035563, "balance_loss_clip": 1.06743145, "balance_loss_mlp": 1.02675939, "epoch": 0.18986352431912462, "flos": 12966122736000.0, "grad_norm": 2.2916791920256476, "language_loss": 0.70852053, "learning_rate": 3.7379920003145447e-06, "loss": 0.73106396, "num_input_tokens_seen": 33513520, "step": 1579, "time_per_iteration": 2.463214874267578 }, { "auxiliary_loss_clip": 0.0118767, "auxiliary_loss_mlp": 0.01034849, "balance_loss_clip": 1.06668365, "balance_loss_mlp": 1.02434111, "epoch": 0.18998376720976373, "flos": 23767908497280.0, "grad_norm": 1.6783597772581498, "language_loss": 0.83887434, "learning_rate": 3.7376064184409817e-06, "loss": 0.86109948, "num_input_tokens_seen": 33533100, "step": 1580, "time_per_iteration": 2.5786995887756348 }, { "auxiliary_loss_clip": 0.01189477, "auxiliary_loss_mlp": 0.01043785, "balance_loss_clip": 1.06370533, "balance_loss_mlp": 1.03353858, "epoch": 0.19010401010040281, "flos": 22966023323520.0, "grad_norm": 1.4417709694603642, "language_loss": 0.87085807, "learning_rate": 3.7372205729739063e-06, "loss": 0.89319062, "num_input_tokens_seen": 33554915, "step": 1581, "time_per_iteration": 3.31112003326416 }, { "auxiliary_loss_clip": 0.01211809, "auxiliary_loss_mlp": 0.01044859, "balance_loss_clip": 1.06601357, "balance_loss_mlp": 1.03453541, "epoch": 0.1902242529910419, "flos": 19135647774720.0, "grad_norm": 2.5928263941316683, "language_loss": 0.71894276, "learning_rate": 3.7368344639718514e-06, "loss": 0.7415095, "num_input_tokens_seen": 33572850, "step": 1582, "time_per_iteration": 3.2351958751678467 }, { "auxiliary_loss_clip": 0.01210686, "auxiliary_loss_mlp": 0.01034794, "balance_loss_clip": 1.06549561, "balance_loss_mlp": 1.02542377, "epoch": 0.190344495881681, "flos": 25483935824640.0, "grad_norm": 1.6735957400698258, "language_loss": 0.80493921, "learning_rate": 3.7364480914933895e-06, "loss": 0.82739401, "num_input_tokens_seen": 33593090, "step": 1583, "time_per_iteration": 2.5721964836120605 }, { "auxiliary_loss_clip": 0.01161165, "auxiliary_loss_mlp": 0.00763551, "balance_loss_clip": 1.0606916, "balance_loss_mlp": 0.99995327, "epoch": 0.1904647387723201, "flos": 26792843425920.0, "grad_norm": 2.252631233473881, "language_loss": 0.81006652, "learning_rate": 3.7360614555971325e-06, "loss": 0.82931364, "num_input_tokens_seen": 33612745, "step": 1584, "time_per_iteration": 2.674231767654419 }, { "auxiliary_loss_clip": 0.01207129, "auxiliary_loss_mlp": 0.00762995, "balance_loss_clip": 1.06598055, "balance_loss_mlp": 0.99995089, "epoch": 0.19058498166295917, "flos": 23987753688960.0, "grad_norm": 1.9683190947197498, "language_loss": 0.85154605, "learning_rate": 3.735674556341733e-06, "loss": 0.87124729, "num_input_tokens_seen": 33632360, "step": 1585, "time_per_iteration": 2.5384464263916016 }, { "auxiliary_loss_clip": 0.01190474, "auxiliary_loss_mlp": 0.01039261, "balance_loss_clip": 1.06607461, "balance_loss_mlp": 1.02946222, "epoch": 0.19070522455359826, "flos": 28293299280000.0, "grad_norm": 1.8808456708347476, "language_loss": 0.82036543, "learning_rate": 3.7352873937858835e-06, "loss": 0.84266281, "num_input_tokens_seen": 33653895, "step": 1586, "time_per_iteration": 2.6135456562042236 }, { "auxiliary_loss_clip": 0.01172176, "auxiliary_loss_mlp": 0.00763291, "balance_loss_clip": 1.06119657, "balance_loss_mlp": 0.99996519, "epoch": 0.19082546744423737, "flos": 25660220797440.0, "grad_norm": 2.109215747995312, "language_loss": 0.71816301, "learning_rate": 3.734899967988316e-06, "loss": 0.73751771, "num_input_tokens_seen": 33672075, "step": 1587, "time_per_iteration": 2.6193926334381104 }, { "auxiliary_loss_clip": 0.01169248, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.05968571, "balance_loss_mlp": 1.02700996, "epoch": 0.19094571033487645, "flos": 19719483436800.0, "grad_norm": 1.8633460370060302, "language_loss": 0.83956748, "learning_rate": 3.7345122790078026e-06, "loss": 0.86161506, "num_input_tokens_seen": 33689640, "step": 1588, "time_per_iteration": 2.562445640563965 }, { "auxiliary_loss_clip": 0.01208098, "auxiliary_loss_mlp": 0.01048044, "balance_loss_clip": 1.0677706, "balance_loss_mlp": 1.0379169, "epoch": 0.19106595322551553, "flos": 21616320850560.0, "grad_norm": 3.3597897858932337, "language_loss": 0.92385173, "learning_rate": 3.7341243269031556e-06, "loss": 0.94641316, "num_input_tokens_seen": 33708630, "step": 1589, "time_per_iteration": 2.521916389465332 }, { "auxiliary_loss_clip": 0.0118601, "auxiliary_loss_mlp": 0.0104087, "balance_loss_clip": 1.06382263, "balance_loss_mlp": 1.03172624, "epoch": 0.19118619611615464, "flos": 29896890059520.0, "grad_norm": 1.9878148621655847, "language_loss": 0.77184314, "learning_rate": 3.7337361117332275e-06, "loss": 0.79411197, "num_input_tokens_seen": 33730370, "step": 1590, "time_per_iteration": 2.62974214553833 }, { "auxiliary_loss_clip": 0.01182797, "auxiliary_loss_mlp": 0.01042776, "balance_loss_clip": 1.06324339, "balance_loss_mlp": 1.03247666, "epoch": 0.19130643900679373, "flos": 17273428093440.0, "grad_norm": 2.767126878092092, "language_loss": 0.77270925, "learning_rate": 3.7333476335569087e-06, "loss": 0.79496503, "num_input_tokens_seen": 33748370, "step": 1591, "time_per_iteration": 2.567790985107422 }, { "auxiliary_loss_clip": 0.01194914, "auxiliary_loss_mlp": 0.01044931, "balance_loss_clip": 1.06686544, "balance_loss_mlp": 1.03414226, "epoch": 0.1914266818974328, "flos": 24826339584000.0, "grad_norm": 2.3652462505806624, "language_loss": 0.67202127, "learning_rate": 3.7329588924331325e-06, "loss": 0.69441968, "num_input_tokens_seen": 33769575, "step": 1592, "time_per_iteration": 2.593810558319092 }, { "auxiliary_loss_clip": 0.01169589, "auxiliary_loss_mlp": 0.0103522, "balance_loss_clip": 1.05703378, "balance_loss_mlp": 1.02609468, "epoch": 0.1915469247880719, "flos": 18952467390720.0, "grad_norm": 1.848251233538044, "language_loss": 0.82832348, "learning_rate": 3.732569888420871e-06, "loss": 0.8503716, "num_input_tokens_seen": 33789110, "step": 1593, "time_per_iteration": 2.5826942920684814 }, { "auxiliary_loss_clip": 0.01225083, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.06719661, "balance_loss_mlp": 1.03027487, "epoch": 0.191667167678711, "flos": 21032952065280.0, "grad_norm": 3.0539324790950633, "language_loss": 0.82551259, "learning_rate": 3.732180621579134e-06, "loss": 0.84816921, "num_input_tokens_seen": 33808325, "step": 1594, "time_per_iteration": 2.5077199935913086 }, { "auxiliary_loss_clip": 0.01189182, "auxiliary_loss_mlp": 0.01039352, "balance_loss_clip": 1.06585526, "balance_loss_mlp": 1.02927291, "epoch": 0.1917874105693501, "flos": 34237663914240.0, "grad_norm": 2.152136303767593, "language_loss": 0.8138991, "learning_rate": 3.7317910919669745e-06, "loss": 0.83618438, "num_input_tokens_seen": 33829520, "step": 1595, "time_per_iteration": 2.680912494659424 }, { "auxiliary_loss_clip": 0.01210084, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.07068038, "balance_loss_mlp": 1.02973986, "epoch": 0.19190765345998917, "flos": 23550613171200.0, "grad_norm": 2.668108021249357, "language_loss": 0.76453, "learning_rate": 3.7314012996434826e-06, "loss": 0.78702569, "num_input_tokens_seen": 33848250, "step": 1596, "time_per_iteration": 2.5401010513305664 }, { "auxiliary_loss_clip": 0.01193579, "auxiliary_loss_mlp": 0.010314, "balance_loss_clip": 1.06635487, "balance_loss_mlp": 1.02194691, "epoch": 0.19202789635062828, "flos": 19861330245120.0, "grad_norm": 2.1155611690859444, "language_loss": 0.80858648, "learning_rate": 3.7310112446677907e-06, "loss": 0.83083636, "num_input_tokens_seen": 33866160, "step": 1597, "time_per_iteration": 2.530593156814575 }, { "auxiliary_loss_clip": 0.01225033, "auxiliary_loss_mlp": 0.0104155, "balance_loss_clip": 1.06996918, "balance_loss_mlp": 1.03172743, "epoch": 0.19214813924126736, "flos": 20922957642240.0, "grad_norm": 2.175317961755692, "language_loss": 0.68722361, "learning_rate": 3.7306209270990695e-06, "loss": 0.70988941, "num_input_tokens_seen": 33884165, "step": 1598, "time_per_iteration": 2.4971094131469727 }, { "auxiliary_loss_clip": 0.01193571, "auxiliary_loss_mlp": 0.01039289, "balance_loss_clip": 1.06478572, "balance_loss_mlp": 1.02985954, "epoch": 0.19226838213190645, "flos": 26359725231360.0, "grad_norm": 1.8955949831120291, "language_loss": 0.86957085, "learning_rate": 3.7302303469965292e-06, "loss": 0.89189947, "num_input_tokens_seen": 33903705, "step": 1599, "time_per_iteration": 2.5892434120178223 }, { "auxiliary_loss_clip": 0.01209281, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 1.06941295, "balance_loss_mlp": 1.02390575, "epoch": 0.19238862502254553, "flos": 20850525866880.0, "grad_norm": 2.124731459402865, "language_loss": 0.70592016, "learning_rate": 3.7298395044194206e-06, "loss": 0.72835159, "num_input_tokens_seen": 33922515, "step": 1600, "time_per_iteration": 2.528857707977295 }, { "auxiliary_loss_clip": 0.01226768, "auxiliary_loss_mlp": 0.01036843, "balance_loss_clip": 1.07209706, "balance_loss_mlp": 1.02739549, "epoch": 0.19250886791318464, "flos": 21726063878400.0, "grad_norm": 1.785306764877352, "language_loss": 0.93947339, "learning_rate": 3.7294483994270356e-06, "loss": 0.96210945, "num_input_tokens_seen": 33940840, "step": 1601, "time_per_iteration": 2.49971342086792 }, { "auxiliary_loss_clip": 0.01157755, "auxiliary_loss_mlp": 0.01042009, "balance_loss_clip": 1.06000626, "balance_loss_mlp": 1.03283024, "epoch": 0.19262911080382372, "flos": 23367827836800.0, "grad_norm": 2.1367982376329238, "language_loss": 0.77881706, "learning_rate": 3.7290570320787033e-06, "loss": 0.80081475, "num_input_tokens_seen": 33960420, "step": 1602, "time_per_iteration": 3.3707187175750732 }, { "auxiliary_loss_clip": 0.01205087, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.06622124, "balance_loss_mlp": 1.0320065, "epoch": 0.1927493536944628, "flos": 21943502858880.0, "grad_norm": 2.066283884138619, "language_loss": 0.71182287, "learning_rate": 3.728665402433793e-06, "loss": 0.73429561, "num_input_tokens_seen": 33978990, "step": 1603, "time_per_iteration": 2.5080907344818115 }, { "auxiliary_loss_clip": 0.01190602, "auxiliary_loss_mlp": 0.0103903, "balance_loss_clip": 1.06432211, "balance_loss_mlp": 1.0298332, "epoch": 0.19286959658510192, "flos": 16545590807040.0, "grad_norm": 2.430601814237927, "language_loss": 0.85823929, "learning_rate": 3.7282735105517164e-06, "loss": 0.8805356, "num_input_tokens_seen": 33997115, "step": 1604, "time_per_iteration": 3.318866491317749 }, { "auxiliary_loss_clip": 0.01170947, "auxiliary_loss_mlp": 0.01044963, "balance_loss_clip": 1.06134415, "balance_loss_mlp": 1.0350033, "epoch": 0.192989839475741, "flos": 21616967295360.0, "grad_norm": 2.2769409088068504, "language_loss": 0.67114073, "learning_rate": 3.727881356491922e-06, "loss": 0.69329983, "num_input_tokens_seen": 34015525, "step": 1605, "time_per_iteration": 2.6759657859802246 }, { "auxiliary_loss_clip": 0.01225077, "auxiliary_loss_mlp": 0.01033853, "balance_loss_clip": 1.07316947, "balance_loss_mlp": 1.02528811, "epoch": 0.19311008236638008, "flos": 19281516906240.0, "grad_norm": 1.9055238019129455, "language_loss": 0.75721776, "learning_rate": 3.7274889403139002e-06, "loss": 0.77980709, "num_input_tokens_seen": 34033150, "step": 1606, "time_per_iteration": 2.4606754779815674 }, { "auxiliary_loss_clip": 0.0115798, "auxiliary_loss_mlp": 0.01031068, "balance_loss_clip": 1.06199849, "balance_loss_mlp": 1.0219599, "epoch": 0.1932303252570192, "flos": 28652369587200.0, "grad_norm": 2.897975487281842, "language_loss": 0.78316689, "learning_rate": 3.727096262077179e-06, "loss": 0.80505735, "num_input_tokens_seen": 34052145, "step": 1607, "time_per_iteration": 3.421605110168457 }, { "auxiliary_loss_clip": 0.01207986, "auxiliary_loss_mlp": 0.0103925, "balance_loss_clip": 1.06710792, "balance_loss_mlp": 1.03016675, "epoch": 0.19335056814765827, "flos": 18368990864640.0, "grad_norm": 1.8330893357982525, "language_loss": 0.85523307, "learning_rate": 3.7267033218413285e-06, "loss": 0.87770545, "num_input_tokens_seen": 34069940, "step": 1608, "time_per_iteration": 3.309817314147949 }, { "auxiliary_loss_clip": 0.01147369, "auxiliary_loss_mlp": 0.01040209, "balance_loss_clip": 1.05417132, "balance_loss_mlp": 1.03060102, "epoch": 0.19347081103829736, "flos": 13260877741440.0, "grad_norm": 2.1407826113892283, "language_loss": 0.8104462, "learning_rate": 3.726310119665957e-06, "loss": 0.832322, "num_input_tokens_seen": 34086275, "step": 1609, "time_per_iteration": 2.6206576824188232 }, { "auxiliary_loss_clip": 0.01208705, "auxiliary_loss_mlp": 0.01034858, "balance_loss_clip": 1.0672524, "balance_loss_mlp": 1.02573848, "epoch": 0.19359105392893644, "flos": 20300122788480.0, "grad_norm": 1.94994621344314, "language_loss": 0.85252362, "learning_rate": 3.725916655610713e-06, "loss": 0.87495923, "num_input_tokens_seen": 34105605, "step": 1610, "time_per_iteration": 2.5416109561920166 }, { "auxiliary_loss_clip": 0.01180643, "auxiliary_loss_mlp": 0.01039714, "balance_loss_clip": 1.06076884, "balance_loss_mlp": 1.03018367, "epoch": 0.19371129681957555, "flos": 20484596062080.0, "grad_norm": 4.936361726402711, "language_loss": 0.75070357, "learning_rate": 3.725522929735284e-06, "loss": 0.7729072, "num_input_tokens_seen": 34122540, "step": 1611, "time_per_iteration": 2.5264620780944824 }, { "auxiliary_loss_clip": 0.01194711, "auxiliary_loss_mlp": 0.01042943, "balance_loss_clip": 1.06017208, "balance_loss_mlp": 1.03363323, "epoch": 0.19383153971021463, "flos": 30445497457920.0, "grad_norm": 1.9971649939096234, "language_loss": 0.74061704, "learning_rate": 3.725128942099399e-06, "loss": 0.76299357, "num_input_tokens_seen": 34142940, "step": 1612, "time_per_iteration": 2.6491646766662598 }, { "auxiliary_loss_clip": 0.0118019, "auxiliary_loss_mlp": 0.01037872, "balance_loss_clip": 1.05943346, "balance_loss_mlp": 1.02820396, "epoch": 0.19395178260085372, "flos": 24569937325440.0, "grad_norm": 1.6282666223723283, "language_loss": 0.8012684, "learning_rate": 3.7247346927628245e-06, "loss": 0.82344902, "num_input_tokens_seen": 34162875, "step": 1613, "time_per_iteration": 2.576286792755127 }, { "auxiliary_loss_clip": 0.01193712, "auxiliary_loss_mlp": 0.00763102, "balance_loss_clip": 1.06629205, "balance_loss_mlp": 0.99999571, "epoch": 0.19407202549149283, "flos": 28950608211840.0, "grad_norm": 1.7497663935025447, "language_loss": 0.79053062, "learning_rate": 3.7243401817853694e-06, "loss": 0.81009877, "num_input_tokens_seen": 34183565, "step": 1614, "time_per_iteration": 2.6022913455963135 }, { "auxiliary_loss_clip": 0.01199165, "auxiliary_loss_mlp": 0.0103511, "balance_loss_clip": 1.06402922, "balance_loss_mlp": 1.0266099, "epoch": 0.1941922683821319, "flos": 18004497603840.0, "grad_norm": 1.8315426792994132, "language_loss": 0.7178247, "learning_rate": 3.723945409226879e-06, "loss": 0.74016744, "num_input_tokens_seen": 34202055, "step": 1615, "time_per_iteration": 2.489342212677002 }, { "auxiliary_loss_clip": 0.0120247, "auxiliary_loss_mlp": 0.01037313, "balance_loss_clip": 1.06162024, "balance_loss_mlp": 1.02859926, "epoch": 0.194312511272771, "flos": 9720337034880.0, "grad_norm": 2.2395806243731657, "language_loss": 0.80309802, "learning_rate": 3.723550375147241e-06, "loss": 0.82549584, "num_input_tokens_seen": 34216830, "step": 1616, "time_per_iteration": 2.595553398132324 }, { "auxiliary_loss_clip": 0.01164006, "auxiliary_loss_mlp": 0.01031516, "balance_loss_clip": 1.05850804, "balance_loss_mlp": 1.02150893, "epoch": 0.19443275416341008, "flos": 27016208150400.0, "grad_norm": 2.1438228125620475, "language_loss": 0.79932648, "learning_rate": 3.7231550796063816e-06, "loss": 0.82128167, "num_input_tokens_seen": 34236840, "step": 1617, "time_per_iteration": 2.6086628437042236 }, { "auxiliary_loss_clip": 0.01199231, "auxiliary_loss_mlp": 0.01039163, "balance_loss_clip": 1.0676837, "balance_loss_mlp": 1.02903616, "epoch": 0.1945529970540492, "flos": 15846625077120.0, "grad_norm": 2.124657731583981, "language_loss": 0.64883018, "learning_rate": 3.722759522664266e-06, "loss": 0.6712141, "num_input_tokens_seen": 34254140, "step": 1618, "time_per_iteration": 2.5219247341156006 }, { "auxiliary_loss_clip": 0.01165326, "auxiliary_loss_mlp": 0.01035637, "balance_loss_clip": 1.05960512, "balance_loss_mlp": 1.02610648, "epoch": 0.19467323994468827, "flos": 19314985403520.0, "grad_norm": 2.445517349566283, "language_loss": 0.81907588, "learning_rate": 3.7223637043809016e-06, "loss": 0.84108555, "num_input_tokens_seen": 34273120, "step": 1619, "time_per_iteration": 2.5876033306121826 }, { "auxiliary_loss_clip": 0.01181123, "auxiliary_loss_mlp": 0.01042436, "balance_loss_clip": 1.06553149, "balance_loss_mlp": 1.03360248, "epoch": 0.19479348283532735, "flos": 24133227770880.0, "grad_norm": 1.6982010173416828, "language_loss": 0.86391312, "learning_rate": 3.7219676248163322e-06, "loss": 0.88614875, "num_input_tokens_seen": 34290285, "step": 1620, "time_per_iteration": 2.59574818611145 }, { "auxiliary_loss_clip": 0.01211218, "auxiliary_loss_mlp": 0.0103989, "balance_loss_clip": 1.06813896, "balance_loss_mlp": 1.03050792, "epoch": 0.19491372572596646, "flos": 25775638174080.0, "grad_norm": 1.7038799929322694, "language_loss": 0.93443179, "learning_rate": 3.721571284030643e-06, "loss": 0.95694292, "num_input_tokens_seen": 34310095, "step": 1621, "time_per_iteration": 2.547821044921875 }, { "auxiliary_loss_clip": 0.01211485, "auxiliary_loss_mlp": 0.01040123, "balance_loss_clip": 1.06786156, "balance_loss_mlp": 1.03079462, "epoch": 0.19503396861660555, "flos": 19645220067840.0, "grad_norm": 2.502105022175523, "language_loss": 0.79054672, "learning_rate": 3.7211746820839587e-06, "loss": 0.81306279, "num_input_tokens_seen": 34327190, "step": 1622, "time_per_iteration": 2.510995864868164 }, { "auxiliary_loss_clip": 0.0111523, "auxiliary_loss_mlp": 0.01030042, "balance_loss_clip": 1.05225563, "balance_loss_mlp": 1.02107739, "epoch": 0.19515421150724463, "flos": 21033023892480.0, "grad_norm": 1.66729912829583, "language_loss": 0.80894327, "learning_rate": 3.7207778190364437e-06, "loss": 0.83039594, "num_input_tokens_seen": 34345615, "step": 1623, "time_per_iteration": 2.661425828933716 }, { "auxiliary_loss_clip": 0.01127833, "auxiliary_loss_mlp": 0.01043969, "balance_loss_clip": 1.05255485, "balance_loss_mlp": 1.03468239, "epoch": 0.1952744543978837, "flos": 32961255143040.0, "grad_norm": 1.516021560858203, "language_loss": 0.73783696, "learning_rate": 3.720380694948302e-06, "loss": 0.75955504, "num_input_tokens_seen": 34368500, "step": 1624, "time_per_iteration": 2.754500389099121 }, { "auxiliary_loss_clip": 0.01070753, "auxiliary_loss_mlp": 0.01003295, "balance_loss_clip": 1.02800202, "balance_loss_mlp": 1.00142312, "epoch": 0.19539469728852282, "flos": 64044312030720.0, "grad_norm": 1.0450052395083482, "language_loss": 0.71250159, "learning_rate": 3.719983309879777e-06, "loss": 0.73324203, "num_input_tokens_seen": 34428280, "step": 1625, "time_per_iteration": 3.178575038909912 }, { "auxiliary_loss_clip": 0.01164572, "auxiliary_loss_mlp": 0.01046766, "balance_loss_clip": 1.05908418, "balance_loss_mlp": 1.0369668, "epoch": 0.1955149401791619, "flos": 13370908078080.0, "grad_norm": 1.7297579136428711, "language_loss": 0.77489305, "learning_rate": 3.719585663891151e-06, "loss": 0.79700649, "num_input_tokens_seen": 34445815, "step": 1626, "time_per_iteration": 2.5778322219848633 }, { "auxiliary_loss_clip": 0.01150409, "auxiliary_loss_mlp": 0.01040869, "balance_loss_clip": 1.05883801, "balance_loss_mlp": 1.03130853, "epoch": 0.195635183069801, "flos": 18728887184640.0, "grad_norm": 2.06231860278537, "language_loss": 0.78809005, "learning_rate": 3.719187757042747e-06, "loss": 0.8100028, "num_input_tokens_seen": 34463635, "step": 1627, "time_per_iteration": 2.5879223346710205 }, { "auxiliary_loss_clip": 0.01087725, "auxiliary_loss_mlp": 0.01004702, "balance_loss_clip": 1.02703094, "balance_loss_mlp": 1.00281811, "epoch": 0.1957554259604401, "flos": 69313952615040.0, "grad_norm": 0.7413128730078512, "language_loss": 0.55001438, "learning_rate": 3.7187895893949275e-06, "loss": 0.57093859, "num_input_tokens_seen": 34530105, "step": 1628, "time_per_iteration": 3.9408671855926514 }, { "auxiliary_loss_clip": 0.01143294, "auxiliary_loss_mlp": 0.01034143, "balance_loss_clip": 1.05248523, "balance_loss_mlp": 1.02475488, "epoch": 0.19587566885107918, "flos": 21069257736960.0, "grad_norm": 2.2904547130428776, "language_loss": 0.76186401, "learning_rate": 3.7183911610080937e-06, "loss": 0.78363836, "num_input_tokens_seen": 34546970, "step": 1629, "time_per_iteration": 2.638697385787964 }, { "auxiliary_loss_clip": 0.0117936, "auxiliary_loss_mlp": 0.01038143, "balance_loss_clip": 1.06250405, "balance_loss_mlp": 1.02808189, "epoch": 0.19599591174171827, "flos": 22194661731840.0, "grad_norm": 3.0567512892015296, "language_loss": 0.74920225, "learning_rate": 3.7179924719426872e-06, "loss": 0.77137721, "num_input_tokens_seen": 34564865, "step": 1630, "time_per_iteration": 3.418473482131958 }, { "auxiliary_loss_clip": 0.01211234, "auxiliary_loss_mlp": 0.01037349, "balance_loss_clip": 1.06916034, "balance_loss_mlp": 1.02675676, "epoch": 0.19611615463235738, "flos": 23768375374080.0, "grad_norm": 2.653027540645547, "language_loss": 0.76022184, "learning_rate": 3.7175935222591885e-06, "loss": 0.78270763, "num_input_tokens_seen": 34584165, "step": 1631, "time_per_iteration": 2.5463669300079346 }, { "auxiliary_loss_clip": 0.01196606, "auxiliary_loss_mlp": 0.01040632, "balance_loss_clip": 1.06804824, "balance_loss_mlp": 1.03114915, "epoch": 0.19623639752299646, "flos": 28618218731520.0, "grad_norm": 1.87296166965338, "language_loss": 0.74371636, "learning_rate": 3.717194312018118e-06, "loss": 0.76608878, "num_input_tokens_seen": 34603150, "step": 1632, "time_per_iteration": 2.6122450828552246 }, { "auxiliary_loss_clip": 0.01207141, "auxiliary_loss_mlp": 0.0103979, "balance_loss_clip": 1.06560934, "balance_loss_mlp": 1.03019929, "epoch": 0.19635664041363554, "flos": 21032700670080.0, "grad_norm": 1.9460597192089308, "language_loss": 0.75805688, "learning_rate": 3.716794841280036e-06, "loss": 0.78052628, "num_input_tokens_seen": 34621855, "step": 1633, "time_per_iteration": 3.272639274597168 }, { "auxiliary_loss_clip": 0.01210701, "auxiliary_loss_mlp": 0.01035661, "balance_loss_clip": 1.06453967, "balance_loss_mlp": 1.02564764, "epoch": 0.19647688330427462, "flos": 18879748306560.0, "grad_norm": 1.93153429877565, "language_loss": 0.7748543, "learning_rate": 3.7163951101055407e-06, "loss": 0.79731792, "num_input_tokens_seen": 34639915, "step": 1634, "time_per_iteration": 3.2295241355895996 }, { "auxiliary_loss_clip": 0.01184948, "auxiliary_loss_mlp": 0.01042427, "balance_loss_clip": 1.06239843, "balance_loss_mlp": 1.0324192, "epoch": 0.19659712619491373, "flos": 24242503921920.0, "grad_norm": 1.8455574489839723, "language_loss": 0.7878921, "learning_rate": 3.715995118555273e-06, "loss": 0.81016588, "num_input_tokens_seen": 34659890, "step": 1635, "time_per_iteration": 2.5671169757843018 }, { "auxiliary_loss_clip": 0.01154313, "auxiliary_loss_mlp": 0.01046557, "balance_loss_clip": 1.05686176, "balance_loss_mlp": 1.03705561, "epoch": 0.19671736908555282, "flos": 24717422568960.0, "grad_norm": 2.01976674002462, "language_loss": 0.86075938, "learning_rate": 3.71559486668991e-06, "loss": 0.88276809, "num_input_tokens_seen": 34678750, "step": 1636, "time_per_iteration": 2.6302835941314697 }, { "auxiliary_loss_clip": 0.0120871, "auxiliary_loss_mlp": 0.00762849, "balance_loss_clip": 1.06486833, "balance_loss_mlp": 1.00003982, "epoch": 0.1968376119761919, "flos": 23842279607040.0, "grad_norm": 1.8319770376628974, "language_loss": 0.7733829, "learning_rate": 3.715194354570169e-06, "loss": 0.79309857, "num_input_tokens_seen": 34698755, "step": 1637, "time_per_iteration": 2.5438408851623535 }, { "auxiliary_loss_clip": 0.01208172, "auxiliary_loss_mlp": 0.01038372, "balance_loss_clip": 1.06956387, "balance_loss_mlp": 1.02988458, "epoch": 0.196957854866831, "flos": 18113917409280.0, "grad_norm": 1.829312273207489, "language_loss": 0.83151591, "learning_rate": 3.714793582256809e-06, "loss": 0.85398138, "num_input_tokens_seen": 34715820, "step": 1638, "time_per_iteration": 2.4935898780822754 }, { "auxiliary_loss_clip": 0.01222556, "auxiliary_loss_mlp": 0.01039566, "balance_loss_clip": 1.06954718, "balance_loss_mlp": 1.03054237, "epoch": 0.1970780977574701, "flos": 21653129312640.0, "grad_norm": 2.4409974170492883, "language_loss": 0.84953928, "learning_rate": 3.7143925498106253e-06, "loss": 0.87216055, "num_input_tokens_seen": 34734360, "step": 1639, "time_per_iteration": 2.482412815093994 }, { "auxiliary_loss_clip": 0.01189857, "auxiliary_loss_mlp": 0.01039213, "balance_loss_clip": 1.0598489, "balance_loss_mlp": 1.02907407, "epoch": 0.19719834064810918, "flos": 20811813984000.0, "grad_norm": 1.9602916486412927, "language_loss": 0.7920441, "learning_rate": 3.7139912572924558e-06, "loss": 0.81433481, "num_input_tokens_seen": 34753390, "step": 1640, "time_per_iteration": 2.569084882736206 }, { "auxiliary_loss_clip": 0.01201437, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.0610249, "balance_loss_mlp": 1.02381516, "epoch": 0.19731858353874826, "flos": 23434800744960.0, "grad_norm": 2.8047809100343, "language_loss": 0.80473369, "learning_rate": 3.7135897047631744e-06, "loss": 0.82708633, "num_input_tokens_seen": 34771275, "step": 1641, "time_per_iteration": 2.527642250061035 }, { "auxiliary_loss_clip": 0.01194726, "auxiliary_loss_mlp": 0.0103712, "balance_loss_clip": 1.06532276, "balance_loss_mlp": 1.02816117, "epoch": 0.19743882642938737, "flos": 23988184652160.0, "grad_norm": 2.1323922182323445, "language_loss": 0.7601124, "learning_rate": 3.713187892283698e-06, "loss": 0.78243089, "num_input_tokens_seen": 34790885, "step": 1642, "time_per_iteration": 2.5713937282562256 }, { "auxiliary_loss_clip": 0.01159388, "auxiliary_loss_mlp": 0.0103736, "balance_loss_clip": 1.05697334, "balance_loss_mlp": 1.02758515, "epoch": 0.19755906932002645, "flos": 15004340081280.0, "grad_norm": 2.1669940921877746, "language_loss": 0.87160707, "learning_rate": 3.71278581991498e-06, "loss": 0.8935746, "num_input_tokens_seen": 34806745, "step": 1643, "time_per_iteration": 2.57444429397583 }, { "auxiliary_loss_clip": 0.01177839, "auxiliary_loss_mlp": 0.00763737, "balance_loss_clip": 1.06376219, "balance_loss_mlp": 1.00004983, "epoch": 0.19767931221066554, "flos": 19494466686720.0, "grad_norm": 2.1251505445119108, "language_loss": 0.78275228, "learning_rate": 3.712383487718015e-06, "loss": 0.80216801, "num_input_tokens_seen": 34824985, "step": 1644, "time_per_iteration": 2.578662633895874 }, { "auxiliary_loss_clip": 0.0113545, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.05492628, "balance_loss_mlp": 1.0305531, "epoch": 0.19779955510130465, "flos": 25737895958400.0, "grad_norm": 1.8439776060040405, "language_loss": 0.86640251, "learning_rate": 3.7119808957538365e-06, "loss": 0.88815248, "num_input_tokens_seen": 34843980, "step": 1645, "time_per_iteration": 2.647707223892212 }, { "auxiliary_loss_clip": 0.01183152, "auxiliary_loss_mlp": 0.01044005, "balance_loss_clip": 1.05762625, "balance_loss_mlp": 1.03386641, "epoch": 0.19791979799194373, "flos": 20777699041920.0, "grad_norm": 1.9501377555327886, "language_loss": 0.80147415, "learning_rate": 3.711578044083517e-06, "loss": 0.82374573, "num_input_tokens_seen": 34860780, "step": 1646, "time_per_iteration": 2.547541856765747 }, { "auxiliary_loss_clip": 0.01190153, "auxiliary_loss_mlp": 0.01037249, "balance_loss_clip": 1.06017685, "balance_loss_mlp": 1.02786708, "epoch": 0.1980400408825828, "flos": 25589010084480.0, "grad_norm": 1.6616707598656286, "language_loss": 0.7458334, "learning_rate": 3.7111749327681698e-06, "loss": 0.76810741, "num_input_tokens_seen": 34880815, "step": 1647, "time_per_iteration": 2.5879123210906982 }, { "auxiliary_loss_clip": 0.01206919, "auxiliary_loss_mlp": 0.01032961, "balance_loss_clip": 1.06561279, "balance_loss_mlp": 1.02335286, "epoch": 0.1981602837732219, "flos": 23513840622720.0, "grad_norm": 2.4490809756660132, "language_loss": 0.85995662, "learning_rate": 3.7107715618689455e-06, "loss": 0.88235539, "num_input_tokens_seen": 34899790, "step": 1648, "time_per_iteration": 2.542264461517334 }, { "auxiliary_loss_clip": 0.01201459, "auxiliary_loss_mlp": 0.01037928, "balance_loss_clip": 1.06430554, "balance_loss_mlp": 1.02852809, "epoch": 0.198280526663861, "flos": 23185365724800.0, "grad_norm": 1.5080095576724653, "language_loss": 0.83385187, "learning_rate": 3.710367931447035e-06, "loss": 0.85624576, "num_input_tokens_seen": 34921570, "step": 1649, "time_per_iteration": 2.5481879711151123 }, { "auxiliary_loss_clip": 0.01209605, "auxiliary_loss_mlp": 0.01045607, "balance_loss_clip": 1.06373739, "balance_loss_mlp": 1.03575397, "epoch": 0.1984007695545001, "flos": 21689470897920.0, "grad_norm": 3.453751346777361, "language_loss": 0.86955124, "learning_rate": 3.70996404156367e-06, "loss": 0.89210337, "num_input_tokens_seen": 34941205, "step": 1650, "time_per_iteration": 2.5325958728790283 }, { "auxiliary_loss_clip": 0.01142327, "auxiliary_loss_mlp": 0.01048036, "balance_loss_clip": 1.05143237, "balance_loss_mlp": 1.03858829, "epoch": 0.19852101244513917, "flos": 36064008887040.0, "grad_norm": 1.6790164296390193, "language_loss": 0.72803462, "learning_rate": 3.7095598922801187e-06, "loss": 0.74993825, "num_input_tokens_seen": 34963280, "step": 1651, "time_per_iteration": 2.7295308113098145 }, { "auxiliary_loss_clip": 0.01221923, "auxiliary_loss_mlp": 0.01036122, "balance_loss_clip": 1.06751752, "balance_loss_mlp": 1.02765775, "epoch": 0.19864125533577828, "flos": 23105894883840.0, "grad_norm": 3.150521696053154, "language_loss": 0.76295793, "learning_rate": 3.7091554836576914e-06, "loss": 0.78553832, "num_input_tokens_seen": 34979955, "step": 1652, "time_per_iteration": 2.487957239151001 }, { "auxiliary_loss_clip": 0.01202418, "auxiliary_loss_mlp": 0.00762309, "balance_loss_clip": 1.06686866, "balance_loss_mlp": 1.00005031, "epoch": 0.19876149822641737, "flos": 24608505553920.0, "grad_norm": 1.6855345552631666, "language_loss": 0.82677126, "learning_rate": 3.708750815757736e-06, "loss": 0.8464185, "num_input_tokens_seen": 35000725, "step": 1653, "time_per_iteration": 2.5743284225463867 }, { "auxiliary_loss_clip": 0.01202944, "auxiliary_loss_mlp": 0.01042441, "balance_loss_clip": 1.06324387, "balance_loss_mlp": 1.03341103, "epoch": 0.19888174111705645, "flos": 32196645308160.0, "grad_norm": 2.1469407760084045, "language_loss": 0.73211455, "learning_rate": 3.7083458886416407e-06, "loss": 0.7545684, "num_input_tokens_seen": 35019920, "step": 1654, "time_per_iteration": 3.2785933017730713 }, { "auxiliary_loss_clip": 0.01145236, "auxiliary_loss_mlp": 0.01035049, "balance_loss_clip": 1.05686259, "balance_loss_mlp": 1.0260253, "epoch": 0.19900198400769553, "flos": 24608469640320.0, "grad_norm": 2.352900313313421, "language_loss": 0.88229173, "learning_rate": 3.707940702370832e-06, "loss": 0.90409458, "num_input_tokens_seen": 35040765, "step": 1655, "time_per_iteration": 2.6687936782836914 }, { "auxiliary_loss_clip": 0.01091479, "auxiliary_loss_mlp": 0.01002835, "balance_loss_clip": 1.0230577, "balance_loss_mlp": 1.00076103, "epoch": 0.19912222689833464, "flos": 67915805673600.0, "grad_norm": 0.7577168363036942, "language_loss": 0.582515, "learning_rate": 3.707535257006777e-06, "loss": 0.60345817, "num_input_tokens_seen": 35106390, "step": 1656, "time_per_iteration": 3.9134440422058105 }, { "auxiliary_loss_clip": 0.0118635, "auxiliary_loss_mlp": 0.010381, "balance_loss_clip": 1.05930853, "balance_loss_mlp": 1.02892661, "epoch": 0.19924246978897373, "flos": 15742340916480.0, "grad_norm": 2.083139257330628, "language_loss": 0.88520992, "learning_rate": 3.707129552610981e-06, "loss": 0.90745437, "num_input_tokens_seen": 35125040, "step": 1657, "time_per_iteration": 2.520728349685669 }, { "auxiliary_loss_clip": 0.01182783, "auxiliary_loss_mlp": 0.01040741, "balance_loss_clip": 1.06299591, "balance_loss_mlp": 1.03122842, "epoch": 0.1993627126796128, "flos": 17566566986880.0, "grad_norm": 1.8843867576530986, "language_loss": 0.7371524, "learning_rate": 3.70672358924499e-06, "loss": 0.75938761, "num_input_tokens_seen": 35144280, "step": 1658, "time_per_iteration": 2.5416126251220703 }, { "auxiliary_loss_clip": 0.01173483, "auxiliary_loss_mlp": 0.0103659, "balance_loss_clip": 1.06268525, "balance_loss_mlp": 1.02751243, "epoch": 0.19948295557025192, "flos": 40843826680320.0, "grad_norm": 2.0047277355743516, "language_loss": 0.78372967, "learning_rate": 3.706317366970386e-06, "loss": 0.80583042, "num_input_tokens_seen": 35165280, "step": 1659, "time_per_iteration": 3.4792778491973877 }, { "auxiliary_loss_clip": 0.01220803, "auxiliary_loss_mlp": 0.00763386, "balance_loss_clip": 1.06524277, "balance_loss_mlp": 1.00005984, "epoch": 0.199603198460891, "flos": 25082418620160.0, "grad_norm": 2.1702711357606814, "language_loss": 0.83417308, "learning_rate": 3.705910885848795e-06, "loss": 0.85401499, "num_input_tokens_seen": 35183655, "step": 1660, "time_per_iteration": 3.332139492034912 }, { "auxiliary_loss_clip": 0.01202558, "auxiliary_loss_mlp": 0.01036797, "balance_loss_clip": 1.06446171, "balance_loss_mlp": 1.02748656, "epoch": 0.19972344135153008, "flos": 20084120352000.0, "grad_norm": 2.012091285507101, "language_loss": 0.84191287, "learning_rate": 3.705504145941879e-06, "loss": 0.86430639, "num_input_tokens_seen": 35201825, "step": 1661, "time_per_iteration": 2.498910903930664 }, { "auxiliary_loss_clip": 0.01217105, "auxiliary_loss_mlp": 0.01041612, "balance_loss_clip": 1.06409335, "balance_loss_mlp": 1.03311491, "epoch": 0.1998436842421692, "flos": 23727472761600.0, "grad_norm": 1.7879300457031901, "language_loss": 0.79102981, "learning_rate": 3.7050971473113403e-06, "loss": 0.81361699, "num_input_tokens_seen": 35221600, "step": 1662, "time_per_iteration": 2.5092973709106445 }, { "auxiliary_loss_clip": 0.01199158, "auxiliary_loss_mlp": 0.00762087, "balance_loss_clip": 1.06132412, "balance_loss_mlp": 1.00007915, "epoch": 0.19996392713280828, "flos": 36102361633920.0, "grad_norm": 1.6653849155480214, "language_loss": 0.79995596, "learning_rate": 3.7046898900189196e-06, "loss": 0.8195684, "num_input_tokens_seen": 35245935, "step": 1663, "time_per_iteration": 2.7298309803009033 }, { "auxiliary_loss_clip": 0.01179859, "auxiliary_loss_mlp": 0.01044334, "balance_loss_clip": 1.06248331, "balance_loss_mlp": 1.03528631, "epoch": 0.20008417002344736, "flos": 23657662679040.0, "grad_norm": 1.8664789692871777, "language_loss": 0.82883871, "learning_rate": 3.704282374126398e-06, "loss": 0.85108066, "num_input_tokens_seen": 35265615, "step": 1664, "time_per_iteration": 2.6399664878845215 }, { "auxiliary_loss_clip": 0.01172609, "auxiliary_loss_mlp": 0.0104253, "balance_loss_clip": 1.05944324, "balance_loss_mlp": 1.03327298, "epoch": 0.20020441291408644, "flos": 21872076664320.0, "grad_norm": 1.7981491045014408, "language_loss": 0.87421274, "learning_rate": 3.7038745996955954e-06, "loss": 0.89636415, "num_input_tokens_seen": 35284960, "step": 1665, "time_per_iteration": 2.6242945194244385 }, { "auxiliary_loss_clip": 0.0117703, "auxiliary_loss_mlp": 0.0103785, "balance_loss_clip": 1.05757391, "balance_loss_mlp": 1.02793801, "epoch": 0.20032465580472555, "flos": 23179691376000.0, "grad_norm": 3.194060676773115, "language_loss": 0.71920693, "learning_rate": 3.703466566788371e-06, "loss": 0.74135572, "num_input_tokens_seen": 35304090, "step": 1666, "time_per_iteration": 2.580848217010498 }, { "auxiliary_loss_clip": 0.01179305, "auxiliary_loss_mlp": 0.01038413, "balance_loss_clip": 1.06178761, "balance_loss_mlp": 1.02901912, "epoch": 0.20044489869536464, "flos": 23873521461120.0, "grad_norm": 2.045467283379868, "language_loss": 0.74783653, "learning_rate": 3.703058275466622e-06, "loss": 0.77001369, "num_input_tokens_seen": 35323325, "step": 1667, "time_per_iteration": 2.6313648223876953 }, { "auxiliary_loss_clip": 0.01186387, "auxiliary_loss_mlp": 0.0103034, "balance_loss_clip": 1.06189454, "balance_loss_mlp": 1.022192, "epoch": 0.20056514158600372, "flos": 21945226711680.0, "grad_norm": 1.850712046730482, "language_loss": 0.77665269, "learning_rate": 3.7026497257922877e-06, "loss": 0.79881996, "num_input_tokens_seen": 35343635, "step": 1668, "time_per_iteration": 2.585545301437378 }, { "auxiliary_loss_clip": 0.01151382, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.0541929, "balance_loss_mlp": 1.02823484, "epoch": 0.20068538447664283, "flos": 23879159896320.0, "grad_norm": 1.599503241159991, "language_loss": 0.85068649, "learning_rate": 3.7022409178273436e-06, "loss": 0.87257373, "num_input_tokens_seen": 35364615, "step": 1669, "time_per_iteration": 2.6325223445892334 }, { "auxiliary_loss_clip": 0.01201321, "auxiliary_loss_mlp": 0.01036071, "balance_loss_clip": 1.06611466, "balance_loss_mlp": 1.02748168, "epoch": 0.2008056273672819, "flos": 18442823270400.0, "grad_norm": 1.9264878397785965, "language_loss": 0.78469884, "learning_rate": 3.7018318516338054e-06, "loss": 0.80707276, "num_input_tokens_seen": 35383775, "step": 1670, "time_per_iteration": 2.499721050262451 }, { "auxiliary_loss_clip": 0.01207686, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.06490457, "balance_loss_mlp": 1.02661657, "epoch": 0.200925870257921, "flos": 23659530186240.0, "grad_norm": 2.2642014318600787, "language_loss": 0.81522083, "learning_rate": 3.7014225272737284e-06, "loss": 0.83766127, "num_input_tokens_seen": 35403000, "step": 1671, "time_per_iteration": 2.5328280925750732 }, { "auxiliary_loss_clip": 0.01195347, "auxiliary_loss_mlp": 0.01034361, "balance_loss_clip": 1.06195986, "balance_loss_mlp": 1.02537215, "epoch": 0.20104611314856008, "flos": 16217115909120.0, "grad_norm": 2.3783087943848202, "language_loss": 0.74130112, "learning_rate": 3.701012944809207e-06, "loss": 0.7635982, "num_input_tokens_seen": 35420115, "step": 1672, "time_per_iteration": 2.4908220767974854 }, { "auxiliary_loss_clip": 0.0119199, "auxiliary_loss_mlp": 0.00762498, "balance_loss_clip": 1.06557107, "balance_loss_mlp": 1.00011671, "epoch": 0.2011663560391992, "flos": 21397373498880.0, "grad_norm": 1.9696943352977963, "language_loss": 0.78453314, "learning_rate": 3.700603104302374e-06, "loss": 0.80407798, "num_input_tokens_seen": 35439925, "step": 1673, "time_per_iteration": 2.5978379249572754 }, { "auxiliary_loss_clip": 0.01071792, "auxiliary_loss_mlp": 0.01003369, "balance_loss_clip": 1.03631508, "balance_loss_mlp": 1.00125921, "epoch": 0.20128659892983827, "flos": 62229459409920.0, "grad_norm": 0.896387798126695, "language_loss": 0.56010562, "learning_rate": 3.7001930058154027e-06, "loss": 0.58085722, "num_input_tokens_seen": 35504885, "step": 1674, "time_per_iteration": 3.2054691314697266 }, { "auxiliary_loss_clip": 0.01172949, "auxiliary_loss_mlp": 0.01045689, "balance_loss_clip": 1.06077921, "balance_loss_mlp": 1.03511548, "epoch": 0.20140684182047736, "flos": 28438737448320.0, "grad_norm": 2.4347814001338848, "language_loss": 0.79867363, "learning_rate": 3.6997826494105037e-06, "loss": 0.82085997, "num_input_tokens_seen": 35525330, "step": 1675, "time_per_iteration": 2.640554904937744 }, { "auxiliary_loss_clip": 0.01191701, "auxiliary_loss_mlp": 0.01038693, "balance_loss_clip": 1.06434524, "balance_loss_mlp": 1.03025913, "epoch": 0.20152708471111647, "flos": 28074064619520.0, "grad_norm": 3.1893421698910664, "language_loss": 0.69436586, "learning_rate": 3.6993720351499286e-06, "loss": 0.7166698, "num_input_tokens_seen": 35546455, "step": 1676, "time_per_iteration": 2.606886625289917 }, { "auxiliary_loss_clip": 0.01183174, "auxiliary_loss_mlp": 0.01033745, "balance_loss_clip": 1.06399202, "balance_loss_mlp": 1.02531099, "epoch": 0.20164732760175555, "flos": 23549751244800.0, "grad_norm": 2.079768944813036, "language_loss": 0.77067965, "learning_rate": 3.6989611630959666e-06, "loss": 0.79284883, "num_input_tokens_seen": 35565010, "step": 1677, "time_per_iteration": 2.5936663150787354 }, { "auxiliary_loss_clip": 0.01094814, "auxiliary_loss_mlp": 0.01003755, "balance_loss_clip": 1.02603412, "balance_loss_mlp": 1.00166845, "epoch": 0.20176757049239463, "flos": 71100616037760.0, "grad_norm": 2.2544613137243386, "language_loss": 0.58294541, "learning_rate": 3.6985500333109474e-06, "loss": 0.60393101, "num_input_tokens_seen": 35633340, "step": 1678, "time_per_iteration": 3.1958117485046387 }, { "auxiliary_loss_clip": 0.01164829, "auxiliary_loss_mlp": 0.01033264, "balance_loss_clip": 1.05878007, "balance_loss_mlp": 1.02409649, "epoch": 0.20188781338303372, "flos": 21430159637760.0, "grad_norm": 2.454360213496597, "language_loss": 0.76511395, "learning_rate": 3.6981386458572385e-06, "loss": 0.78709489, "num_input_tokens_seen": 35651315, "step": 1679, "time_per_iteration": 3.439234972000122 }, { "auxiliary_loss_clip": 0.01170543, "auxiliary_loss_mlp": 0.01040399, "balance_loss_clip": 1.05943096, "balance_loss_mlp": 1.03136325, "epoch": 0.20200805627367283, "flos": 11546215130880.0, "grad_norm": 2.2694366270189676, "language_loss": 0.76112741, "learning_rate": 3.6977270007972468e-06, "loss": 0.78323686, "num_input_tokens_seen": 35668850, "step": 1680, "time_per_iteration": 2.595019578933716 }, { "auxiliary_loss_clip": 0.01194563, "auxiliary_loss_mlp": 0.01033311, "balance_loss_clip": 1.06538737, "balance_loss_mlp": 1.02456117, "epoch": 0.2021282991643119, "flos": 28545391906560.0, "grad_norm": 2.291652527149069, "language_loss": 0.72189021, "learning_rate": 3.6973150981934196e-06, "loss": 0.74416888, "num_input_tokens_seen": 35690080, "step": 1681, "time_per_iteration": 3.3767318725585938 }, { "auxiliary_loss_clip": 0.01222159, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.06777084, "balance_loss_mlp": 1.02334762, "epoch": 0.202248542054951, "flos": 17923446564480.0, "grad_norm": 2.5265905680527463, "language_loss": 0.841084, "learning_rate": 3.6969029381082415e-06, "loss": 0.86363423, "num_input_tokens_seen": 35706075, "step": 1682, "time_per_iteration": 2.476741075515747 }, { "auxiliary_loss_clip": 0.01183947, "auxiliary_loss_mlp": 0.01031562, "balance_loss_clip": 1.06201458, "balance_loss_mlp": 1.02333033, "epoch": 0.2023687849455901, "flos": 19864634296320.0, "grad_norm": 1.9024268349402147, "language_loss": 0.79715955, "learning_rate": 3.696490520604237e-06, "loss": 0.8193146, "num_input_tokens_seen": 35724765, "step": 1683, "time_per_iteration": 2.5574145317077637 }, { "auxiliary_loss_clip": 0.01202508, "auxiliary_loss_mlp": 0.01038058, "balance_loss_clip": 1.06616604, "balance_loss_mlp": 1.02891493, "epoch": 0.20248902783622919, "flos": 22564721600640.0, "grad_norm": 1.6811910279075426, "language_loss": 0.8034243, "learning_rate": 3.696077845743968e-06, "loss": 0.82582998, "num_input_tokens_seen": 35744355, "step": 1684, "time_per_iteration": 2.5642013549804688 }, { "auxiliary_loss_clip": 0.01218386, "auxiliary_loss_mlp": 0.0104165, "balance_loss_clip": 1.06371903, "balance_loss_mlp": 1.03198767, "epoch": 0.20260927072686827, "flos": 22709728805760.0, "grad_norm": 2.7136734623903305, "language_loss": 0.73247087, "learning_rate": 3.69566491359004e-06, "loss": 0.75507116, "num_input_tokens_seen": 35761000, "step": 1685, "time_per_iteration": 3.2501134872436523 }, { "auxiliary_loss_clip": 0.01184917, "auxiliary_loss_mlp": 0.01040207, "balance_loss_clip": 1.05978477, "balance_loss_mlp": 1.03091502, "epoch": 0.20272951361750738, "flos": 51023998650240.0, "grad_norm": 2.2475290851518004, "language_loss": 0.69115651, "learning_rate": 3.695251724205092e-06, "loss": 0.71340775, "num_input_tokens_seen": 35785360, "step": 1686, "time_per_iteration": 3.58396577835083 }, { "auxiliary_loss_clip": 0.01218164, "auxiliary_loss_mlp": 0.01035276, "balance_loss_clip": 1.06554055, "balance_loss_mlp": 1.02623415, "epoch": 0.20284975650814646, "flos": 26578133879040.0, "grad_norm": 1.586489494444149, "language_loss": 0.86374056, "learning_rate": 3.6948382776518054e-06, "loss": 0.88627493, "num_input_tokens_seen": 35806065, "step": 1687, "time_per_iteration": 2.5361804962158203 }, { "auxiliary_loss_clip": 0.01178886, "auxiliary_loss_mlp": 0.01036376, "balance_loss_clip": 1.05820012, "balance_loss_mlp": 1.02660716, "epoch": 0.20296999939878554, "flos": 16034222833920.0, "grad_norm": 2.5735093798449413, "language_loss": 0.79383409, "learning_rate": 3.6944245739929e-06, "loss": 0.81598669, "num_input_tokens_seen": 35822225, "step": 1688, "time_per_iteration": 2.5313823223114014 }, { "auxiliary_loss_clip": 0.01205549, "auxiliary_loss_mlp": 0.01038811, "balance_loss_clip": 1.06417179, "balance_loss_mlp": 1.0289706, "epoch": 0.20309024228942463, "flos": 19203374868480.0, "grad_norm": 3.480369651290209, "language_loss": 0.72124946, "learning_rate": 3.6940106132911332e-06, "loss": 0.74369305, "num_input_tokens_seen": 35839410, "step": 1689, "time_per_iteration": 2.5325663089752197 }, { "auxiliary_loss_clip": 0.01201738, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.06228733, "balance_loss_mlp": 1.02374279, "epoch": 0.20321048518006374, "flos": 22821087945600.0, "grad_norm": 2.321356270082823, "language_loss": 0.88802993, "learning_rate": 3.6935963956093037e-06, "loss": 0.91037643, "num_input_tokens_seen": 35859495, "step": 1690, "time_per_iteration": 2.530231475830078 }, { "auxiliary_loss_clip": 0.01196941, "auxiliary_loss_mlp": 0.01042653, "balance_loss_clip": 1.06392837, "balance_loss_mlp": 1.03347969, "epoch": 0.20333072807070282, "flos": 19096397187840.0, "grad_norm": 1.9527062373056907, "language_loss": 0.68870354, "learning_rate": 3.6931819210102474e-06, "loss": 0.71109951, "num_input_tokens_seen": 35878890, "step": 1691, "time_per_iteration": 2.530916452407837 }, { "auxiliary_loss_clip": 0.01218143, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.06427014, "balance_loss_mlp": 1.02219915, "epoch": 0.2034509709613419, "flos": 18180962144640.0, "grad_norm": 1.7755316371720002, "language_loss": 0.84366202, "learning_rate": 3.6927671895568402e-06, "loss": 0.86615598, "num_input_tokens_seen": 35897950, "step": 1692, "time_per_iteration": 2.4939329624176025 }, { "auxiliary_loss_clip": 0.01223264, "auxiliary_loss_mlp": 0.01042803, "balance_loss_clip": 1.06919193, "balance_loss_mlp": 1.03336775, "epoch": 0.20357121385198101, "flos": 22923899648640.0, "grad_norm": 2.370514514673758, "language_loss": 0.87054586, "learning_rate": 3.692352201311996e-06, "loss": 0.89320648, "num_input_tokens_seen": 35916800, "step": 1693, "time_per_iteration": 2.477813959121704 }, { "auxiliary_loss_clip": 0.01165065, "auxiliary_loss_mlp": 0.01034243, "balance_loss_clip": 1.05643845, "balance_loss_mlp": 1.02477789, "epoch": 0.2036914567426201, "flos": 20922131629440.0, "grad_norm": 2.111977701821932, "language_loss": 0.76861882, "learning_rate": 3.6919369563386687e-06, "loss": 0.79061186, "num_input_tokens_seen": 35936600, "step": 1694, "time_per_iteration": 2.581807851791382 }, { "auxiliary_loss_clip": 0.01183342, "auxiliary_loss_mlp": 0.01041396, "balance_loss_clip": 1.06000626, "balance_loss_mlp": 1.03247857, "epoch": 0.20381169963325918, "flos": 15519155760000.0, "grad_norm": 3.1819032500554103, "language_loss": 0.78757375, "learning_rate": 3.69152145469985e-06, "loss": 0.80982107, "num_input_tokens_seen": 35953645, "step": 1695, "time_per_iteration": 2.5328786373138428 }, { "auxiliary_loss_clip": 0.01158971, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.05532157, "balance_loss_mlp": 1.02555537, "epoch": 0.20393194252389826, "flos": 28833143760000.0, "grad_norm": 1.9443057097346088, "language_loss": 0.81745565, "learning_rate": 3.691105696458572e-06, "loss": 0.83940738, "num_input_tokens_seen": 35970940, "step": 1696, "time_per_iteration": 2.6566383838653564 }, { "auxiliary_loss_clip": 0.01215741, "auxiliary_loss_mlp": 0.01032664, "balance_loss_clip": 1.06489623, "balance_loss_mlp": 1.02411664, "epoch": 0.20405218541453737, "flos": 22488554810880.0, "grad_norm": 2.5136253715035504, "language_loss": 0.68357927, "learning_rate": 3.690689681677904e-06, "loss": 0.70606333, "num_input_tokens_seen": 35989410, "step": 1697, "time_per_iteration": 2.5045814514160156 }, { "auxiliary_loss_clip": 0.01187778, "auxiliary_loss_mlp": 0.0103293, "balance_loss_clip": 1.05696249, "balance_loss_mlp": 1.02413237, "epoch": 0.20417242830517646, "flos": 25374408278400.0, "grad_norm": 1.7875445082910515, "language_loss": 0.88970387, "learning_rate": 3.690273410420956e-06, "loss": 0.91191089, "num_input_tokens_seen": 36009175, "step": 1698, "time_per_iteration": 2.5803349018096924 }, { "auxiliary_loss_clip": 0.0119846, "auxiliary_loss_mlp": 0.01043654, "balance_loss_clip": 1.06015813, "balance_loss_mlp": 1.03439784, "epoch": 0.20429267119581554, "flos": 14793078240000.0, "grad_norm": 2.260683465050588, "language_loss": 0.76718646, "learning_rate": 3.689856882750875e-06, "loss": 0.78960758, "num_input_tokens_seen": 36024375, "step": 1699, "time_per_iteration": 2.4736506938934326 }, { "auxiliary_loss_clip": 0.01199796, "auxiliary_loss_mlp": 0.01034893, "balance_loss_clip": 1.06316304, "balance_loss_mlp": 1.02629769, "epoch": 0.20441291408645465, "flos": 17781851151360.0, "grad_norm": 2.0535151041228574, "language_loss": 0.7834391, "learning_rate": 3.6894400987308486e-06, "loss": 0.80578595, "num_input_tokens_seen": 36041895, "step": 1700, "time_per_iteration": 2.4950199127197266 }, { "auxiliary_loss_clip": 0.01203361, "auxiliary_loss_mlp": 0.01036821, "balance_loss_clip": 1.06138933, "balance_loss_mlp": 1.02786279, "epoch": 0.20453315697709373, "flos": 16435668211200.0, "grad_norm": 1.9975758842294746, "language_loss": 0.84788382, "learning_rate": 3.6890230584241024e-06, "loss": 0.87028563, "num_input_tokens_seen": 36058825, "step": 1701, "time_per_iteration": 2.5253922939300537 }, { "auxiliary_loss_clip": 0.01106309, "auxiliary_loss_mlp": 0.0100488, "balance_loss_clip": 1.02346671, "balance_loss_mlp": 1.00254393, "epoch": 0.20465339986773282, "flos": 66713085653760.0, "grad_norm": 1.0356248958730427, "language_loss": 0.66446781, "learning_rate": 3.6886057618939016e-06, "loss": 0.68557978, "num_input_tokens_seen": 36121645, "step": 1702, "time_per_iteration": 3.1514835357666016 }, { "auxiliary_loss_clip": 0.01163414, "auxiliary_loss_mlp": 0.01038612, "balance_loss_clip": 1.05545211, "balance_loss_mlp": 1.02843714, "epoch": 0.2047736427583719, "flos": 41974114924800.0, "grad_norm": 2.0031599343588726, "language_loss": 0.69369829, "learning_rate": 3.6881882092035492e-06, "loss": 0.71571851, "num_input_tokens_seen": 36143030, "step": 1703, "time_per_iteration": 2.730560779571533 }, { "auxiliary_loss_clip": 0.01076705, "auxiliary_loss_mlp": 0.00753001, "balance_loss_clip": 1.02303708, "balance_loss_mlp": 1.00019574, "epoch": 0.204893885649011, "flos": 69940878641280.0, "grad_norm": 0.9263532792700441, "language_loss": 0.61256588, "learning_rate": 3.6877704004163873e-06, "loss": 0.63086295, "num_input_tokens_seen": 36203435, "step": 1704, "time_per_iteration": 3.3109655380249023 }, { "auxiliary_loss_clip": 0.01218918, "auxiliary_loss_mlp": 0.01040054, "balance_loss_clip": 1.06384122, "balance_loss_mlp": 1.03009963, "epoch": 0.2050141285396501, "flos": 22200012858240.0, "grad_norm": 2.9563300445726863, "language_loss": 0.77848095, "learning_rate": 3.6873523355957984e-06, "loss": 0.80107069, "num_input_tokens_seen": 36222435, "step": 1705, "time_per_iteration": 3.2323403358459473 }, { "auxiliary_loss_clip": 0.01102912, "auxiliary_loss_mlp": 0.01006345, "balance_loss_clip": 1.02033496, "balance_loss_mlp": 1.00409162, "epoch": 0.20513437143028918, "flos": 46283721730560.0, "grad_norm": 0.9818722437470498, "language_loss": 0.64074767, "learning_rate": 3.686934014805201e-06, "loss": 0.66184032, "num_input_tokens_seen": 36273065, "step": 1706, "time_per_iteration": 2.9311726093292236 }, { "auxiliary_loss_clip": 0.01199893, "auxiliary_loss_mlp": 0.01043211, "balance_loss_clip": 1.06301522, "balance_loss_mlp": 1.03378129, "epoch": 0.20525461432092829, "flos": 21904324099200.0, "grad_norm": 1.8631498404291655, "language_loss": 0.80870032, "learning_rate": 3.6865154381080552e-06, "loss": 0.83113134, "num_input_tokens_seen": 36293750, "step": 1707, "time_per_iteration": 2.5563607215881348 }, { "auxiliary_loss_clip": 0.01126515, "auxiliary_loss_mlp": 0.01040634, "balance_loss_clip": 1.05098343, "balance_loss_mlp": 1.03159153, "epoch": 0.20537485721156737, "flos": 21214264942080.0, "grad_norm": 3.1428348155918586, "language_loss": 0.82789433, "learning_rate": 3.6860966055678585e-06, "loss": 0.84956574, "num_input_tokens_seen": 36310105, "step": 1708, "time_per_iteration": 3.4999380111694336 }, { "auxiliary_loss_clip": 0.0120392, "auxiliary_loss_mlp": 0.01036333, "balance_loss_clip": 1.06402504, "balance_loss_mlp": 1.02706456, "epoch": 0.20549510010220645, "flos": 20191205773440.0, "grad_norm": 1.8090437184010248, "language_loss": 0.86228991, "learning_rate": 3.685677517248147e-06, "loss": 0.88469243, "num_input_tokens_seen": 36328995, "step": 1709, "time_per_iteration": 2.6079885959625244 }, { "auxiliary_loss_clip": 0.01187798, "auxiliary_loss_mlp": 0.00762274, "balance_loss_clip": 1.06430948, "balance_loss_mlp": 1.00022507, "epoch": 0.20561534299284553, "flos": 17016702612480.0, "grad_norm": 1.8883669939319985, "language_loss": 0.80434602, "learning_rate": 3.6852581732124967e-06, "loss": 0.82384682, "num_input_tokens_seen": 36346340, "step": 1710, "time_per_iteration": 2.522298574447632 }, { "auxiliary_loss_clip": 0.01203474, "auxiliary_loss_mlp": 0.01042497, "balance_loss_clip": 1.06396461, "balance_loss_mlp": 1.03261411, "epoch": 0.20573558588348465, "flos": 22890467064960.0, "grad_norm": 2.136583206972171, "language_loss": 0.76482683, "learning_rate": 3.6848385735245213e-06, "loss": 0.78728658, "num_input_tokens_seen": 36365430, "step": 1711, "time_per_iteration": 3.4694912433624268 }, { "auxiliary_loss_clip": 0.01183031, "auxiliary_loss_mlp": 0.01033961, "balance_loss_clip": 1.05427146, "balance_loss_mlp": 1.02519298, "epoch": 0.20585582877412373, "flos": 24643123286400.0, "grad_norm": 1.7498946892310732, "language_loss": 0.86333978, "learning_rate": 3.6844187182478734e-06, "loss": 0.88550973, "num_input_tokens_seen": 36386285, "step": 1712, "time_per_iteration": 3.3554019927978516 }, { "auxiliary_loss_clip": 0.01174061, "auxiliary_loss_mlp": 0.01039323, "balance_loss_clip": 1.0572331, "balance_loss_mlp": 1.0301497, "epoch": 0.2059760716647628, "flos": 24206952435840.0, "grad_norm": 2.0531090594482104, "language_loss": 0.75014567, "learning_rate": 3.683998607446246e-06, "loss": 0.7722795, "num_input_tokens_seen": 36404935, "step": 1713, "time_per_iteration": 2.5609142780303955 }, { "auxiliary_loss_clip": 0.01203878, "auxiliary_loss_mlp": 0.0104051, "balance_loss_clip": 1.06466377, "balance_loss_mlp": 1.03117621, "epoch": 0.20609631455540192, "flos": 20229522606720.0, "grad_norm": 1.9307027911850765, "language_loss": 0.74819231, "learning_rate": 3.6835782411833686e-06, "loss": 0.77063614, "num_input_tokens_seen": 36424455, "step": 1714, "time_per_iteration": 2.5156447887420654 }, { "auxiliary_loss_clip": 0.01158779, "auxiliary_loss_mlp": 0.01033332, "balance_loss_clip": 1.05409908, "balance_loss_mlp": 1.024755, "epoch": 0.206216557446041, "flos": 19864957518720.0, "grad_norm": 1.83381027900753, "language_loss": 0.74186343, "learning_rate": 3.68315761952301e-06, "loss": 0.76378453, "num_input_tokens_seen": 36441685, "step": 1715, "time_per_iteration": 2.5801353454589844 }, { "auxiliary_loss_clip": 0.01214273, "auxiliary_loss_mlp": 0.01035494, "balance_loss_clip": 1.06185961, "balance_loss_mlp": 1.02588546, "epoch": 0.2063368003366801, "flos": 24096311568000.0, "grad_norm": 1.9485976866747476, "language_loss": 0.82769096, "learning_rate": 3.6827367425289797e-06, "loss": 0.85018861, "num_input_tokens_seen": 36461460, "step": 1716, "time_per_iteration": 2.518188714981079 }, { "auxiliary_loss_clip": 0.01185074, "auxiliary_loss_mlp": 0.01042457, "balance_loss_clip": 1.06014419, "balance_loss_mlp": 1.03264034, "epoch": 0.2064570432273192, "flos": 20340163474560.0, "grad_norm": 2.175348423475904, "language_loss": 0.72804713, "learning_rate": 3.6823156102651225e-06, "loss": 0.75032246, "num_input_tokens_seen": 36479615, "step": 1717, "time_per_iteration": 2.5415356159210205 }, { "auxiliary_loss_clip": 0.01133626, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 1.05872178, "balance_loss_mlp": 1.02353668, "epoch": 0.20657728611795828, "flos": 20520363029760.0, "grad_norm": 1.7721069288658966, "language_loss": 0.71065778, "learning_rate": 3.6818942227953257e-06, "loss": 0.73232049, "num_input_tokens_seen": 36500160, "step": 1718, "time_per_iteration": 2.662010669708252 }, { "auxiliary_loss_clip": 0.01169801, "auxiliary_loss_mlp": 0.01036298, "balance_loss_clip": 1.05710077, "balance_loss_mlp": 1.02729201, "epoch": 0.20669752900859736, "flos": 21799285752960.0, "grad_norm": 2.824139918896068, "language_loss": 0.6921221, "learning_rate": 3.681472580183512e-06, "loss": 0.71418309, "num_input_tokens_seen": 36518810, "step": 1719, "time_per_iteration": 2.5904173851013184 }, { "auxiliary_loss_clip": 0.01196122, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.06124151, "balance_loss_mlp": 1.02428722, "epoch": 0.20681777189923645, "flos": 15122020014720.0, "grad_norm": 1.895962039055488, "language_loss": 0.86372888, "learning_rate": 3.6810506824936455e-06, "loss": 0.88601828, "num_input_tokens_seen": 36536890, "step": 1720, "time_per_iteration": 2.5047531127929688 }, { "auxiliary_loss_clip": 0.01078519, "auxiliary_loss_mlp": 0.01002905, "balance_loss_clip": 1.01857162, "balance_loss_mlp": 1.00084293, "epoch": 0.20693801478987556, "flos": 56481021509760.0, "grad_norm": 1.0545596526174006, "language_loss": 0.62560916, "learning_rate": 3.680628529789726e-06, "loss": 0.64642334, "num_input_tokens_seen": 36589300, "step": 1721, "time_per_iteration": 2.978342294692993 }, { "auxiliary_loss_clip": 0.01221991, "auxiliary_loss_mlp": 0.01047492, "balance_loss_clip": 1.06564212, "balance_loss_mlp": 1.03806221, "epoch": 0.20705825768051464, "flos": 21614201948160.0, "grad_norm": 2.377839410064548, "language_loss": 0.86219734, "learning_rate": 3.680206122135796e-06, "loss": 0.88489223, "num_input_tokens_seen": 36609905, "step": 1722, "time_per_iteration": 2.5234663486480713 }, { "auxiliary_loss_clip": 0.01161579, "auxiliary_loss_mlp": 0.01038923, "balance_loss_clip": 1.05980933, "balance_loss_mlp": 1.03054905, "epoch": 0.20717850057115372, "flos": 25848895962240.0, "grad_norm": 2.078232943449261, "language_loss": 0.78592545, "learning_rate": 3.6797834595959323e-06, "loss": 0.80793047, "num_input_tokens_seen": 36629805, "step": 1723, "time_per_iteration": 2.6566221714019775 }, { "auxiliary_loss_clip": 0.01147234, "auxiliary_loss_mlp": 0.01034499, "balance_loss_clip": 1.05285668, "balance_loss_mlp": 1.02524281, "epoch": 0.20729874346179283, "flos": 29130807767040.0, "grad_norm": 3.6089929127875875, "language_loss": 0.77825701, "learning_rate": 3.679360542234254e-06, "loss": 0.80007434, "num_input_tokens_seen": 36649150, "step": 1724, "time_per_iteration": 2.6744298934936523 }, { "auxiliary_loss_clip": 0.01177262, "auxiliary_loss_mlp": 0.00762241, "balance_loss_clip": 1.05568838, "balance_loss_mlp": 1.00030398, "epoch": 0.20741898635243192, "flos": 29023363209600.0, "grad_norm": 1.618609591984291, "language_loss": 0.72196198, "learning_rate": 3.678937370114916e-06, "loss": 0.74135697, "num_input_tokens_seen": 36668955, "step": 1725, "time_per_iteration": 2.621530771255493 }, { "auxiliary_loss_clip": 0.01179385, "auxiliary_loss_mlp": 0.01040664, "balance_loss_clip": 1.06071126, "balance_loss_mlp": 1.03249204, "epoch": 0.207539229243071, "flos": 15559447841280.0, "grad_norm": 1.9229893735302308, "language_loss": 0.78967357, "learning_rate": 3.678513943302114e-06, "loss": 0.81187403, "num_input_tokens_seen": 36685730, "step": 1726, "time_per_iteration": 2.523022174835205 }, { "auxiliary_loss_clip": 0.01212903, "auxiliary_loss_mlp": 0.0104172, "balance_loss_clip": 1.06368637, "balance_loss_mlp": 1.03290391, "epoch": 0.20765947213371008, "flos": 20521081301760.0, "grad_norm": 1.6969447698909321, "language_loss": 0.85079837, "learning_rate": 3.678090261860082e-06, "loss": 0.8733446, "num_input_tokens_seen": 36705460, "step": 1727, "time_per_iteration": 2.4954237937927246 }, { "auxiliary_loss_clip": 0.01171217, "auxiliary_loss_mlp": 0.01035832, "balance_loss_clip": 1.05476785, "balance_loss_mlp": 1.02708769, "epoch": 0.2077797150243492, "flos": 19354415558400.0, "grad_norm": 1.8891067464542552, "language_loss": 0.77366281, "learning_rate": 3.6776663258530906e-06, "loss": 0.79573333, "num_input_tokens_seen": 36724110, "step": 1728, "time_per_iteration": 2.6096787452697754 }, { "auxiliary_loss_clip": 0.01201233, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.06044686, "balance_loss_mlp": 1.02685523, "epoch": 0.20789995791498828, "flos": 21829952989440.0, "grad_norm": 2.0466398502185243, "language_loss": 0.71261871, "learning_rate": 3.6772421353454516e-06, "loss": 0.73498607, "num_input_tokens_seen": 36742705, "step": 1729, "time_per_iteration": 2.512300968170166 }, { "auxiliary_loss_clip": 0.01199164, "auxiliary_loss_mlp": 0.01037274, "balance_loss_clip": 1.06557655, "balance_loss_mlp": 1.02787423, "epoch": 0.20802020080562736, "flos": 23148844571520.0, "grad_norm": 1.8845983484778162, "language_loss": 0.88554311, "learning_rate": 3.6768176904015153e-06, "loss": 0.90790749, "num_input_tokens_seen": 36762510, "step": 1730, "time_per_iteration": 2.5216846466064453 }, { "auxiliary_loss_clip": 0.01200737, "auxiliary_loss_mlp": 0.0103806, "balance_loss_clip": 1.06084418, "balance_loss_mlp": 1.02941179, "epoch": 0.20814044369626647, "flos": 23072677781760.0, "grad_norm": 1.9541519116114452, "language_loss": 0.60291141, "learning_rate": 3.6763929910856674e-06, "loss": 0.62529945, "num_input_tokens_seen": 36780960, "step": 1731, "time_per_iteration": 3.3664777278900146 }, { "auxiliary_loss_clip": 0.01197779, "auxiliary_loss_mlp": 0.0103823, "balance_loss_clip": 1.06388271, "balance_loss_mlp": 1.02918792, "epoch": 0.20826068658690555, "flos": 19608016556160.0, "grad_norm": 2.341017024782291, "language_loss": 0.77822649, "learning_rate": 3.6759680374623365e-06, "loss": 0.80058658, "num_input_tokens_seen": 36798875, "step": 1732, "time_per_iteration": 2.495476484298706 }, { "auxiliary_loss_clip": 0.01211085, "auxiliary_loss_mlp": 0.01036754, "balance_loss_clip": 1.06361043, "balance_loss_mlp": 1.02725291, "epoch": 0.20838092947754464, "flos": 25374049142400.0, "grad_norm": 2.580660846124239, "language_loss": 0.7550745, "learning_rate": 3.675542829595986e-06, "loss": 0.77755284, "num_input_tokens_seen": 36818540, "step": 1733, "time_per_iteration": 2.5246336460113525 }, { "auxiliary_loss_clip": 0.01186363, "auxiliary_loss_mlp": 0.01031412, "balance_loss_clip": 1.0611403, "balance_loss_mlp": 1.02273965, "epoch": 0.20850117236818372, "flos": 24061729749120.0, "grad_norm": 1.5822633034063123, "language_loss": 0.79310966, "learning_rate": 3.6751173675511213e-06, "loss": 0.81528735, "num_input_tokens_seen": 36840585, "step": 1734, "time_per_iteration": 3.3607327938079834 }, { "auxiliary_loss_clip": 0.01180736, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.05496752, "balance_loss_mlp": 1.02546144, "epoch": 0.20862141525882283, "flos": 20077799558400.0, "grad_norm": 2.0361543056978677, "language_loss": 0.87288034, "learning_rate": 3.674691651392283e-06, "loss": 0.89503175, "num_input_tokens_seen": 36858255, "step": 1735, "time_per_iteration": 2.542043447494507 }, { "auxiliary_loss_clip": 0.01189124, "auxiliary_loss_mlp": 0.01032989, "balance_loss_clip": 1.06147313, "balance_loss_mlp": 1.02425122, "epoch": 0.2087416581494619, "flos": 39015183237120.0, "grad_norm": 2.122224133722387, "language_loss": 0.76095772, "learning_rate": 3.674265681184053e-06, "loss": 0.78317893, "num_input_tokens_seen": 36881515, "step": 1736, "time_per_iteration": 2.6891977787017822 }, { "auxiliary_loss_clip": 0.01184639, "auxiliary_loss_mlp": 0.01038714, "balance_loss_clip": 1.05818093, "balance_loss_mlp": 1.02988672, "epoch": 0.208861901040101, "flos": 26101994169600.0, "grad_norm": 1.933178039622886, "language_loss": 0.86454254, "learning_rate": 3.6738394569910504e-06, "loss": 0.88677609, "num_input_tokens_seen": 36902055, "step": 1737, "time_per_iteration": 3.3399436473846436 }, { "auxiliary_loss_clip": 0.01196359, "auxiliary_loss_mlp": 0.01046046, "balance_loss_clip": 1.06164217, "balance_loss_mlp": 1.03602087, "epoch": 0.2089821439307401, "flos": 28398732675840.0, "grad_norm": 2.0957394326222687, "language_loss": 0.82614797, "learning_rate": 3.6734129788779333e-06, "loss": 0.84857202, "num_input_tokens_seen": 36921230, "step": 1738, "time_per_iteration": 3.2663323879241943 }, { "auxiliary_loss_clip": 0.01169695, "auxiliary_loss_mlp": 0.01032764, "balance_loss_clip": 1.06135154, "balance_loss_mlp": 1.02441347, "epoch": 0.2091023868213792, "flos": 21069616872960.0, "grad_norm": 3.225955623691178, "language_loss": 0.90261054, "learning_rate": 3.6729862469093976e-06, "loss": 0.92463511, "num_input_tokens_seen": 36940325, "step": 1739, "time_per_iteration": 2.5885255336761475 }, { "auxiliary_loss_clip": 0.01171235, "auxiliary_loss_mlp": 0.0102762, "balance_loss_clip": 1.05731225, "balance_loss_mlp": 1.01937056, "epoch": 0.20922262971201827, "flos": 22455481363200.0, "grad_norm": 2.1544044735113683, "language_loss": 0.82850415, "learning_rate": 3.6725592611501782e-06, "loss": 0.85049272, "num_input_tokens_seen": 36959000, "step": 1740, "time_per_iteration": 2.5705885887145996 }, { "auxiliary_loss_clip": 0.01199459, "auxiliary_loss_mlp": 0.01032349, "balance_loss_clip": 1.06151414, "balance_loss_mlp": 1.02327132, "epoch": 0.20934287260265738, "flos": 27852244179840.0, "grad_norm": 2.104087499010352, "language_loss": 0.7617951, "learning_rate": 3.6721320216650496e-06, "loss": 0.78411317, "num_input_tokens_seen": 36979615, "step": 1741, "time_per_iteration": 2.5975582599639893 }, { "auxiliary_loss_clip": 0.01181877, "auxiliary_loss_mlp": 0.01033943, "balance_loss_clip": 1.0590353, "balance_loss_mlp": 1.0250138, "epoch": 0.20946311549329646, "flos": 16435309075200.0, "grad_norm": 1.9347283012239143, "language_loss": 0.83501828, "learning_rate": 3.6717045285188215e-06, "loss": 0.85717648, "num_input_tokens_seen": 36997310, "step": 1742, "time_per_iteration": 2.606879711151123 }, { "auxiliary_loss_clip": 0.01137058, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.0484798, "balance_loss_mlp": 1.02891529, "epoch": 0.20958335838393555, "flos": 22492720788480.0, "grad_norm": 2.113129818075374, "language_loss": 0.86752963, "learning_rate": 3.671276781776346e-06, "loss": 0.88928282, "num_input_tokens_seen": 37015965, "step": 1743, "time_per_iteration": 2.6138579845428467 }, { "auxiliary_loss_clip": 0.01175491, "auxiliary_loss_mlp": 0.01039342, "balance_loss_clip": 1.05578148, "balance_loss_mlp": 1.03059816, "epoch": 0.20970360127457463, "flos": 25224768218880.0, "grad_norm": 2.347298010821068, "language_loss": 0.66878963, "learning_rate": 3.6708487815025128e-06, "loss": 0.69093794, "num_input_tokens_seen": 37036545, "step": 1744, "time_per_iteration": 2.6337168216705322 }, { "auxiliary_loss_clip": 0.01168642, "auxiliary_loss_mlp": 0.01036007, "balance_loss_clip": 1.05750024, "balance_loss_mlp": 1.02690554, "epoch": 0.20982384416521374, "flos": 18479164855680.0, "grad_norm": 3.6620819286201405, "language_loss": 0.74346453, "learning_rate": 3.6704205277622463e-06, "loss": 0.76551104, "num_input_tokens_seen": 37054985, "step": 1745, "time_per_iteration": 2.5538344383239746 }, { "auxiliary_loss_clip": 0.01186533, "auxiliary_loss_mlp": 0.01033548, "balance_loss_clip": 1.05764127, "balance_loss_mlp": 1.02473259, "epoch": 0.20994408705585282, "flos": 25373546352000.0, "grad_norm": 1.7742188297390749, "language_loss": 0.80347633, "learning_rate": 3.6699920206205146e-06, "loss": 0.8256771, "num_input_tokens_seen": 37075725, "step": 1746, "time_per_iteration": 2.601717948913574 }, { "auxiliary_loss_clip": 0.01200015, "auxiliary_loss_mlp": 0.01035305, "balance_loss_clip": 1.06049395, "balance_loss_mlp": 1.02693677, "epoch": 0.2100643299464919, "flos": 21320955313920.0, "grad_norm": 1.669481246917338, "language_loss": 0.82206702, "learning_rate": 3.669563260142321e-06, "loss": 0.84442025, "num_input_tokens_seen": 37094615, "step": 1747, "time_per_iteration": 2.5198159217834473 }, { "auxiliary_loss_clip": 0.01181103, "auxiliary_loss_mlp": 0.01040101, "balance_loss_clip": 1.0599823, "balance_loss_mlp": 1.03022408, "epoch": 0.21018457283713102, "flos": 19354379644800.0, "grad_norm": 2.0743231561586066, "language_loss": 0.84197032, "learning_rate": 3.6691342463927083e-06, "loss": 0.86418235, "num_input_tokens_seen": 37113610, "step": 1748, "time_per_iteration": 2.5584867000579834 }, { "auxiliary_loss_clip": 0.01170852, "auxiliary_loss_mlp": 0.01039105, "balance_loss_clip": 1.05620074, "balance_loss_mlp": 1.03003311, "epoch": 0.2103048157277701, "flos": 28330035914880.0, "grad_norm": 1.768047482736279, "language_loss": 0.81989002, "learning_rate": 3.668704979436758e-06, "loss": 0.84198958, "num_input_tokens_seen": 37133705, "step": 1749, "time_per_iteration": 2.6231143474578857 }, { "auxiliary_loss_clip": 0.01174486, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.05513716, "balance_loss_mlp": 1.02428806, "epoch": 0.21042505861840918, "flos": 17457290835840.0, "grad_norm": 2.117796100332401, "language_loss": 0.79207546, "learning_rate": 3.668275459339588e-06, "loss": 0.81415874, "num_input_tokens_seen": 37152185, "step": 1750, "time_per_iteration": 2.53269624710083 }, { "auxiliary_loss_clip": 0.01211514, "auxiliary_loss_mlp": 0.01036415, "balance_loss_clip": 1.06376147, "balance_loss_mlp": 1.02796936, "epoch": 0.21054530150904827, "flos": 14209817195520.0, "grad_norm": 1.9025955401998134, "language_loss": 0.8034057, "learning_rate": 3.667845686166358e-06, "loss": 0.82588506, "num_input_tokens_seen": 37169110, "step": 1751, "time_per_iteration": 2.4512040615081787 }, { "auxiliary_loss_clip": 0.01150591, "auxiliary_loss_mlp": 0.01036908, "balance_loss_clip": 1.05257213, "balance_loss_mlp": 1.02765119, "epoch": 0.21066554439968738, "flos": 18618210403200.0, "grad_norm": 1.6970086128558828, "language_loss": 0.86149675, "learning_rate": 3.6674156599822634e-06, "loss": 0.88337171, "num_input_tokens_seen": 37184905, "step": 1752, "time_per_iteration": 2.5544517040252686 }, { "auxiliary_loss_clip": 0.01156551, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.05152369, "balance_loss_mlp": 1.03047061, "epoch": 0.21078578729032646, "flos": 23658883741440.0, "grad_norm": 1.9932189998018164, "language_loss": 0.81789386, "learning_rate": 3.666985380852539e-06, "loss": 0.83986413, "num_input_tokens_seen": 37203910, "step": 1753, "time_per_iteration": 2.6283626556396484 }, { "auxiliary_loss_clip": 0.01180695, "auxiliary_loss_mlp": 0.01034626, "balance_loss_clip": 1.0575819, "balance_loss_mlp": 1.02572107, "epoch": 0.21090603018096554, "flos": 29346379240320.0, "grad_norm": 2.2774132257437825, "language_loss": 0.73731703, "learning_rate": 3.6665548488424576e-06, "loss": 0.75947022, "num_input_tokens_seen": 37222670, "step": 1754, "time_per_iteration": 2.609576463699341 }, { "auxiliary_loss_clip": 0.01211866, "auxiliary_loss_mlp": 0.01036099, "balance_loss_clip": 1.0618242, "balance_loss_mlp": 1.0264554, "epoch": 0.21102627307160465, "flos": 23261245205760.0, "grad_norm": 1.6502455120307287, "language_loss": 0.87988019, "learning_rate": 3.6661240640173307e-06, "loss": 0.90235984, "num_input_tokens_seen": 37244140, "step": 1755, "time_per_iteration": 2.5311052799224854 }, { "auxiliary_loss_clip": 0.0106792, "auxiliary_loss_mlp": 0.01005612, "balance_loss_clip": 1.01985919, "balance_loss_mlp": 1.00339496, "epoch": 0.21114651596224374, "flos": 54633454577280.0, "grad_norm": 0.8594720847701138, "language_loss": 0.57869363, "learning_rate": 3.6656930264425085e-06, "loss": 0.59942895, "num_input_tokens_seen": 37308185, "step": 1756, "time_per_iteration": 3.2043297290802 }, { "auxiliary_loss_clip": 0.01214455, "auxiliary_loss_mlp": 0.01035317, "balance_loss_clip": 1.06466246, "balance_loss_mlp": 1.02716327, "epoch": 0.21126675885288282, "flos": 21543314457600.0, "grad_norm": 1.826455745303862, "language_loss": 0.75574994, "learning_rate": 3.665261736183378e-06, "loss": 0.77824765, "num_input_tokens_seen": 37328220, "step": 1757, "time_per_iteration": 3.245326519012451 }, { "auxiliary_loss_clip": 0.01165153, "auxiliary_loss_mlp": 0.01039398, "balance_loss_clip": 1.0557847, "balance_loss_mlp": 1.03010523, "epoch": 0.2113870017435219, "flos": 10961876678400.0, "grad_norm": 3.5767141692528144, "language_loss": 0.88900101, "learning_rate": 3.664830193305366e-06, "loss": 0.91104645, "num_input_tokens_seen": 37345995, "step": 1758, "time_per_iteration": 2.5554144382476807 }, { "auxiliary_loss_clip": 0.01163062, "auxiliary_loss_mlp": 0.01039473, "balance_loss_clip": 1.05455494, "balance_loss_mlp": 1.03062749, "epoch": 0.211507244634161, "flos": 16653825463680.0, "grad_norm": 3.2110263341744423, "language_loss": 0.77210337, "learning_rate": 3.6643983978739373e-06, "loss": 0.79412872, "num_input_tokens_seen": 37362610, "step": 1759, "time_per_iteration": 2.576718330383301 }, { "auxiliary_loss_clip": 0.01177733, "auxiliary_loss_mlp": 0.01035275, "balance_loss_clip": 1.0604732, "balance_loss_mlp": 1.02594078, "epoch": 0.2116274875248001, "flos": 20954091755520.0, "grad_norm": 1.8000282599774204, "language_loss": 0.81955791, "learning_rate": 3.663966349954596e-06, "loss": 0.84168792, "num_input_tokens_seen": 37382790, "step": 1760, "time_per_iteration": 3.3607428073883057 }, { "auxiliary_loss_clip": 0.01093449, "auxiliary_loss_mlp": 0.01002939, "balance_loss_clip": 1.02167797, "balance_loss_mlp": 1.00067377, "epoch": 0.21174773041543918, "flos": 68196949424640.0, "grad_norm": 0.7918078789410999, "language_loss": 0.59729487, "learning_rate": 3.6635340496128816e-06, "loss": 0.61825877, "num_input_tokens_seen": 37439720, "step": 1761, "time_per_iteration": 3.0550057888031006 }, { "auxiliary_loss_clip": 0.01149398, "auxiliary_loss_mlp": 0.01037339, "balance_loss_clip": 1.05496418, "balance_loss_mlp": 1.02948928, "epoch": 0.2118679733060783, "flos": 20668315150080.0, "grad_norm": 2.015024738659513, "language_loss": 0.92733908, "learning_rate": 3.6631014969143747e-06, "loss": 0.94920641, "num_input_tokens_seen": 37459410, "step": 1762, "time_per_iteration": 2.6127772331237793 }, { "auxiliary_loss_clip": 0.01199947, "auxiliary_loss_mlp": 0.01039268, "balance_loss_clip": 1.06366396, "balance_loss_mlp": 1.03060174, "epoch": 0.21198821619671737, "flos": 23223431162880.0, "grad_norm": 2.0612919908097416, "language_loss": 0.88741601, "learning_rate": 3.662668691924693e-06, "loss": 0.90980816, "num_input_tokens_seen": 37480460, "step": 1763, "time_per_iteration": 3.279811143875122 }, { "auxiliary_loss_clip": 0.0116441, "auxiliary_loss_mlp": 0.01032685, "balance_loss_clip": 1.05342889, "balance_loss_mlp": 1.02340424, "epoch": 0.21210845908735645, "flos": 24498547044480.0, "grad_norm": 1.8824011975507167, "language_loss": 0.71050769, "learning_rate": 3.6622356347094927e-06, "loss": 0.73247862, "num_input_tokens_seen": 37502025, "step": 1764, "time_per_iteration": 3.430847406387329 }, { "auxiliary_loss_clip": 0.01168173, "auxiliary_loss_mlp": 0.01033434, "balance_loss_clip": 1.05583835, "balance_loss_mlp": 1.02383709, "epoch": 0.21222870197799554, "flos": 27089789160960.0, "grad_norm": 2.202802816630925, "language_loss": 0.78354013, "learning_rate": 3.6618023253344684e-06, "loss": 0.80555618, "num_input_tokens_seen": 37520885, "step": 1765, "time_per_iteration": 2.6374897956848145 }, { "auxiliary_loss_clip": 0.01194786, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.0573498, "balance_loss_mlp": 1.01867747, "epoch": 0.21234894486863465, "flos": 16873850223360.0, "grad_norm": 2.5211904019408253, "language_loss": 0.83481383, "learning_rate": 3.6613687638653527e-06, "loss": 0.85704935, "num_input_tokens_seen": 37539055, "step": 1766, "time_per_iteration": 2.4871089458465576 }, { "auxiliary_loss_clip": 0.01180806, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.06087995, "balance_loss_mlp": 1.02729046, "epoch": 0.21246918775927373, "flos": 23474949171840.0, "grad_norm": 1.9057748826647214, "language_loss": 0.77999997, "learning_rate": 3.660934950367916e-06, "loss": 0.8021757, "num_input_tokens_seen": 37558300, "step": 1767, "time_per_iteration": 2.591860055923462 }, { "auxiliary_loss_clip": 0.01198786, "auxiliary_loss_mlp": 0.0103524, "balance_loss_clip": 1.06095779, "balance_loss_mlp": 1.02675867, "epoch": 0.21258943064991281, "flos": 22382295402240.0, "grad_norm": 1.7077428121956286, "language_loss": 0.83534491, "learning_rate": 3.660500884907968e-06, "loss": 0.85768521, "num_input_tokens_seen": 37579040, "step": 1768, "time_per_iteration": 2.5368566513061523 }, { "auxiliary_loss_clip": 0.01063548, "auxiliary_loss_mlp": 0.01003221, "balance_loss_clip": 1.02507973, "balance_loss_mlp": 1.00117064, "epoch": 0.21270967354055192, "flos": 59440168679040.0, "grad_norm": 0.8290222204009497, "language_loss": 0.60101128, "learning_rate": 3.660066567551356e-06, "loss": 0.62167895, "num_input_tokens_seen": 37639185, "step": 1769, "time_per_iteration": 3.111382484436035 }, { "auxiliary_loss_clip": 0.01199566, "auxiliary_loss_mlp": 0.00762044, "balance_loss_clip": 1.06232643, "balance_loss_mlp": 1.00023913, "epoch": 0.212829916431191, "flos": 21544032729600.0, "grad_norm": 4.05603683293989, "language_loss": 0.84390146, "learning_rate": 3.6596319983639657e-06, "loss": 0.86351752, "num_input_tokens_seen": 37657765, "step": 1770, "time_per_iteration": 2.5459229946136475 }, { "auxiliary_loss_clip": 0.01166532, "auxiliary_loss_mlp": 0.00762859, "balance_loss_clip": 1.05650389, "balance_loss_mlp": 1.00025606, "epoch": 0.2129501593218301, "flos": 28987739896320.0, "grad_norm": 1.5920855573543164, "language_loss": 0.86296868, "learning_rate": 3.6591971774117214e-06, "loss": 0.88226265, "num_input_tokens_seen": 37680740, "step": 1771, "time_per_iteration": 2.653820753097534 }, { "auxiliary_loss_clip": 0.01204382, "auxiliary_loss_mlp": 0.01041102, "balance_loss_clip": 1.06254399, "balance_loss_mlp": 1.03341889, "epoch": 0.2130704022124692, "flos": 18806993308800.0, "grad_norm": 1.991834400596459, "language_loss": 0.80474454, "learning_rate": 3.6587621047605833e-06, "loss": 0.82719934, "num_input_tokens_seen": 37697910, "step": 1772, "time_per_iteration": 2.5018656253814697 }, { "auxiliary_loss_clip": 0.01196055, "auxiliary_loss_mlp": 0.01033415, "balance_loss_clip": 1.06027615, "balance_loss_mlp": 1.02423608, "epoch": 0.21319064510310828, "flos": 13918150759680.0, "grad_norm": 2.01765581731561, "language_loss": 0.86696136, "learning_rate": 3.6583267804765542e-06, "loss": 0.88925606, "num_input_tokens_seen": 37712245, "step": 1773, "time_per_iteration": 2.4914960861206055 }, { "auxiliary_loss_clip": 0.0119632, "auxiliary_loss_mlp": 0.01030647, "balance_loss_clip": 1.06096101, "balance_loss_mlp": 1.02095556, "epoch": 0.21331088799374737, "flos": 20959694277120.0, "grad_norm": 2.3437876121451975, "language_loss": 0.85573518, "learning_rate": 3.6578912046256702e-06, "loss": 0.87800491, "num_input_tokens_seen": 37730765, "step": 1774, "time_per_iteration": 2.517674207687378 }, { "auxiliary_loss_clip": 0.01165609, "auxiliary_loss_mlp": 0.01040269, "balance_loss_clip": 1.05679083, "balance_loss_mlp": 1.03125644, "epoch": 0.21343113088438645, "flos": 18624638937600.0, "grad_norm": 2.221147308565393, "language_loss": 0.7601577, "learning_rate": 3.6574553772740083e-06, "loss": 0.78221655, "num_input_tokens_seen": 37748695, "step": 1775, "time_per_iteration": 2.5666632652282715 }, { "auxiliary_loss_clip": 0.01088109, "auxiliary_loss_mlp": 0.01005706, "balance_loss_clip": 1.02077961, "balance_loss_mlp": 1.00334537, "epoch": 0.21355137377502556, "flos": 67413128791680.0, "grad_norm": 1.1157005736514312, "language_loss": 0.61874032, "learning_rate": 3.657019298487684e-06, "loss": 0.63967842, "num_input_tokens_seen": 37813705, "step": 1776, "time_per_iteration": 3.1777141094207764 }, { "auxiliary_loss_clip": 0.01201158, "auxiliary_loss_mlp": 0.00762977, "balance_loss_clip": 1.05973494, "balance_loss_mlp": 1.00027299, "epoch": 0.21367161666566464, "flos": 34532095697280.0, "grad_norm": 2.060449031070092, "language_loss": 0.83694029, "learning_rate": 3.6565829683328495e-06, "loss": 0.85658169, "num_input_tokens_seen": 37836330, "step": 1777, "time_per_iteration": 2.6400482654571533 }, { "auxiliary_loss_clip": 0.01193776, "auxiliary_loss_mlp": 0.01032412, "balance_loss_clip": 1.0615021, "balance_loss_mlp": 1.02396619, "epoch": 0.21379185955630373, "flos": 18989347680000.0, "grad_norm": 1.8494887624061311, "language_loss": 0.86353528, "learning_rate": 3.6561463868756965e-06, "loss": 0.88579714, "num_input_tokens_seen": 37855030, "step": 1778, "time_per_iteration": 2.5014660358428955 }, { "auxiliary_loss_clip": 0.0119847, "auxiliary_loss_mlp": 0.01034901, "balance_loss_clip": 1.06270909, "balance_loss_mlp": 1.02605534, "epoch": 0.21391210244694284, "flos": 28218497207040.0, "grad_norm": 2.3285017347123387, "language_loss": 0.78146219, "learning_rate": 3.655709554182452e-06, "loss": 0.80379593, "num_input_tokens_seen": 37875370, "step": 1779, "time_per_iteration": 2.573662519454956 }, { "auxiliary_loss_clip": 0.01196722, "auxiliary_loss_mlp": 0.01029774, "balance_loss_clip": 1.0575552, "balance_loss_mlp": 1.02014208, "epoch": 0.21403234533758192, "flos": 17455064192640.0, "grad_norm": 2.16832471942644, "language_loss": 0.84347314, "learning_rate": 3.6552724703193855e-06, "loss": 0.86573809, "num_input_tokens_seen": 37892560, "step": 1780, "time_per_iteration": 2.495096445083618 }, { "auxiliary_loss_clip": 0.01054051, "auxiliary_loss_mlp": 0.01008153, "balance_loss_clip": 1.01822448, "balance_loss_mlp": 1.00574481, "epoch": 0.214152588228221, "flos": 51637606686720.0, "grad_norm": 0.7876101690633913, "language_loss": 0.55965704, "learning_rate": 3.654835135352801e-06, "loss": 0.58027905, "num_input_tokens_seen": 37947370, "step": 1781, "time_per_iteration": 3.070044994354248 }, { "auxiliary_loss_clip": 0.01151819, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.05004549, "balance_loss_mlp": 1.02528167, "epoch": 0.21427283111886009, "flos": 19496154625920.0, "grad_norm": 1.8644097719002517, "language_loss": 0.87401688, "learning_rate": 3.654397549349043e-06, "loss": 0.8958776, "num_input_tokens_seen": 37964745, "step": 1782, "time_per_iteration": 3.442904233932495 }, { "auxiliary_loss_clip": 0.0118353, "auxiliary_loss_mlp": 0.0104014, "balance_loss_clip": 1.06292689, "balance_loss_mlp": 1.03037643, "epoch": 0.2143930740094992, "flos": 20084802710400.0, "grad_norm": 2.1340067015272655, "language_loss": 0.75092316, "learning_rate": 3.653959712374491e-06, "loss": 0.77315986, "num_input_tokens_seen": 37982850, "step": 1783, "time_per_iteration": 2.5465593338012695 }, { "auxiliary_loss_clip": 0.01162182, "auxiliary_loss_mlp": 0.010364, "balance_loss_clip": 1.05793619, "balance_loss_mlp": 1.02683949, "epoch": 0.21451331690013828, "flos": 21798603394560.0, "grad_norm": 1.9170927765204293, "language_loss": 0.82445294, "learning_rate": 3.6535216244955663e-06, "loss": 0.84643877, "num_input_tokens_seen": 38002745, "step": 1784, "time_per_iteration": 2.5742788314819336 }, { "auxiliary_loss_clip": 0.0118132, "auxiliary_loss_mlp": 0.01037, "balance_loss_clip": 1.05824876, "balance_loss_mlp": 1.02783263, "epoch": 0.21463355979077736, "flos": 32853882412800.0, "grad_norm": 2.0606889487714652, "language_loss": 0.71236056, "learning_rate": 3.653083285778726e-06, "loss": 0.7345438, "num_input_tokens_seen": 38024115, "step": 1785, "time_per_iteration": 2.6426258087158203 }, { "auxiliary_loss_clip": 0.01204719, "auxiliary_loss_mlp": 0.01037674, "balance_loss_clip": 1.06204844, "balance_loss_mlp": 1.02829874, "epoch": 0.21475380268141647, "flos": 21543817248000.0, "grad_norm": 2.660373378007314, "language_loss": 0.81390363, "learning_rate": 3.6526446962904653e-06, "loss": 0.83632755, "num_input_tokens_seen": 38042830, "step": 1786, "time_per_iteration": 3.309589385986328 }, { "auxiliary_loss_clip": 0.01195731, "auxiliary_loss_mlp": 0.01038682, "balance_loss_clip": 1.06228411, "balance_loss_mlp": 1.02953815, "epoch": 0.21487404557205556, "flos": 32159082660480.0, "grad_norm": 1.6933737999206764, "language_loss": 0.74126923, "learning_rate": 3.652205856097318e-06, "loss": 0.7636134, "num_input_tokens_seen": 38066015, "step": 1787, "time_per_iteration": 2.6018002033233643 }, { "auxiliary_loss_clip": 0.01177919, "auxiliary_loss_mlp": 0.00763207, "balance_loss_clip": 1.05718303, "balance_loss_mlp": 1.00020647, "epoch": 0.21499428846269464, "flos": 12673091583360.0, "grad_norm": 2.007366788913792, "language_loss": 0.78839707, "learning_rate": 3.651766765265856e-06, "loss": 0.80780828, "num_input_tokens_seen": 38083025, "step": 1788, "time_per_iteration": 3.3061418533325195 }, { "auxiliary_loss_clip": 0.01178815, "auxiliary_loss_mlp": 0.0103754, "balance_loss_clip": 1.05647111, "balance_loss_mlp": 1.02748501, "epoch": 0.21511453135333372, "flos": 23471573293440.0, "grad_norm": 3.1315990973485044, "language_loss": 0.8108604, "learning_rate": 3.65132742386269e-06, "loss": 0.83302391, "num_input_tokens_seen": 38098245, "step": 1789, "time_per_iteration": 2.5554301738739014 }, { "auxiliary_loss_clip": 0.01213777, "auxiliary_loss_mlp": 0.01037394, "balance_loss_clip": 1.06201291, "balance_loss_mlp": 1.02859628, "epoch": 0.21523477424397283, "flos": 26943560893440.0, "grad_norm": 1.9836481635723249, "language_loss": 0.84704173, "learning_rate": 3.6508878319544656e-06, "loss": 0.86955345, "num_input_tokens_seen": 38118460, "step": 1790, "time_per_iteration": 3.2349603176116943 }, { "auxiliary_loss_clip": 0.01173601, "auxiliary_loss_mlp": 0.01040031, "balance_loss_clip": 1.05877256, "balance_loss_mlp": 1.03002381, "epoch": 0.21535501713461191, "flos": 18916161719040.0, "grad_norm": 3.398103684844648, "language_loss": 0.8169058, "learning_rate": 3.65044798960787e-06, "loss": 0.83904219, "num_input_tokens_seen": 38136800, "step": 1791, "time_per_iteration": 2.5242550373077393 }, { "auxiliary_loss_clip": 0.01159907, "auxiliary_loss_mlp": 0.01027143, "balance_loss_clip": 1.05470443, "balance_loss_mlp": 1.01847684, "epoch": 0.215475260025251, "flos": 17895113712000.0, "grad_norm": 1.930511700220896, "language_loss": 0.77896643, "learning_rate": 3.650007896889627e-06, "loss": 0.80083686, "num_input_tokens_seen": 38155380, "step": 1792, "time_per_iteration": 2.597355604171753 }, { "auxiliary_loss_clip": 0.01214436, "auxiliary_loss_mlp": 0.01040378, "balance_loss_clip": 1.06569278, "balance_loss_mlp": 1.03080535, "epoch": 0.2155955029158901, "flos": 16654292340480.0, "grad_norm": 1.7411558672954228, "language_loss": 0.80518937, "learning_rate": 3.6495675538664974e-06, "loss": 0.82773757, "num_input_tokens_seen": 38174395, "step": 1793, "time_per_iteration": 2.471722364425659 }, { "auxiliary_loss_clip": 0.01182711, "auxiliary_loss_mlp": 0.0103646, "balance_loss_clip": 1.05513537, "balance_loss_mlp": 1.02692318, "epoch": 0.2157157458065292, "flos": 23621213352960.0, "grad_norm": 1.7102353773213608, "language_loss": 0.82311577, "learning_rate": 3.649126960605282e-06, "loss": 0.84530753, "num_input_tokens_seen": 38195380, "step": 1794, "time_per_iteration": 2.583911657333374 }, { "auxiliary_loss_clip": 0.01179263, "auxiliary_loss_mlp": 0.01032466, "balance_loss_clip": 1.05741215, "balance_loss_mlp": 1.02339959, "epoch": 0.21583598869716827, "flos": 22127078292480.0, "grad_norm": 2.380151079925419, "language_loss": 0.83437598, "learning_rate": 3.6486861171728174e-06, "loss": 0.85649329, "num_input_tokens_seen": 38213775, "step": 1795, "time_per_iteration": 2.533357620239258 }, { "auxiliary_loss_clip": 0.01171051, "auxiliary_loss_mlp": 0.01033395, "balance_loss_clip": 1.0550909, "balance_loss_mlp": 1.02423429, "epoch": 0.21595623158780738, "flos": 23441229279360.0, "grad_norm": 1.6829178474386945, "language_loss": 0.78576922, "learning_rate": 3.6482450236359803e-06, "loss": 0.80781364, "num_input_tokens_seen": 38235630, "step": 1796, "time_per_iteration": 2.637712001800537 }, { "auxiliary_loss_clip": 0.0119605, "auxiliary_loss_mlp": 0.01035885, "balance_loss_clip": 1.06105447, "balance_loss_mlp": 1.02723026, "epoch": 0.21607647447844647, "flos": 26906501036160.0, "grad_norm": 2.287342732119108, "language_loss": 0.77358007, "learning_rate": 3.647803680061683e-06, "loss": 0.79589939, "num_input_tokens_seen": 38256045, "step": 1797, "time_per_iteration": 2.5509908199310303 }, { "auxiliary_loss_clip": 0.01184254, "auxiliary_loss_mlp": 0.01040004, "balance_loss_clip": 1.05920768, "balance_loss_mlp": 1.02983499, "epoch": 0.21619671736908555, "flos": 14495378319360.0, "grad_norm": 3.5241454397463445, "language_loss": 0.74816871, "learning_rate": 3.6473620865168776e-06, "loss": 0.77041125, "num_input_tokens_seen": 38272915, "step": 1798, "time_per_iteration": 2.5207464694976807 }, { "auxiliary_loss_clip": 0.01184296, "auxiliary_loss_mlp": 0.01043762, "balance_loss_clip": 1.06224215, "balance_loss_mlp": 1.03470802, "epoch": 0.21631696025972463, "flos": 17931096161280.0, "grad_norm": 1.8597321068265413, "language_loss": 0.81620193, "learning_rate": 3.646920243068554e-06, "loss": 0.8384825, "num_input_tokens_seen": 38290810, "step": 1799, "time_per_iteration": 2.5166287422180176 }, { "auxiliary_loss_clip": 0.01169509, "auxiliary_loss_mlp": 0.01040001, "balance_loss_clip": 1.05715346, "balance_loss_mlp": 1.02980852, "epoch": 0.21643720315036374, "flos": 24462385027200.0, "grad_norm": 1.6074014002718982, "language_loss": 0.74505556, "learning_rate": 3.6464781497837384e-06, "loss": 0.76715064, "num_input_tokens_seen": 38312785, "step": 1800, "time_per_iteration": 2.584984064102173 }, { "auxiliary_loss_clip": 0.0118571, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.05641818, "balance_loss_mlp": 1.0295341, "epoch": 0.21655744604100283, "flos": 28474432588800.0, "grad_norm": 2.0036948896099993, "language_loss": 0.72636598, "learning_rate": 3.6460358067294965e-06, "loss": 0.74861693, "num_input_tokens_seen": 38334015, "step": 1801, "time_per_iteration": 2.610901117324829 }, { "auxiliary_loss_clip": 0.01216437, "auxiliary_loss_mlp": 0.01034996, "balance_loss_clip": 1.06274533, "balance_loss_mlp": 1.02528644, "epoch": 0.2166776889316419, "flos": 20152960767360.0, "grad_norm": 2.152115693847454, "language_loss": 0.77449989, "learning_rate": 3.645593213972932e-06, "loss": 0.7970143, "num_input_tokens_seen": 38352920, "step": 1802, "time_per_iteration": 2.4989206790924072 }, { "auxiliary_loss_clip": 0.01192817, "auxiliary_loss_mlp": 0.01040117, "balance_loss_clip": 1.05891967, "balance_loss_mlp": 1.03019881, "epoch": 0.21679793182228102, "flos": 15193482122880.0, "grad_norm": 2.1722474029640666, "language_loss": 0.79912198, "learning_rate": 3.6451503715811852e-06, "loss": 0.82145131, "num_input_tokens_seen": 38371230, "step": 1803, "time_per_iteration": 2.4822099208831787 }, { "auxiliary_loss_clip": 0.01178706, "auxiliary_loss_mlp": 0.01029, "balance_loss_clip": 1.05854595, "balance_loss_mlp": 1.0208106, "epoch": 0.2169181747129201, "flos": 17384464010880.0, "grad_norm": 1.9734963162154404, "language_loss": 0.80126309, "learning_rate": 3.6447072796214345e-06, "loss": 0.82334012, "num_input_tokens_seen": 38389795, "step": 1804, "time_per_iteration": 2.5004353523254395 }, { "auxiliary_loss_clip": 0.01059468, "auxiliary_loss_mlp": 0.01002194, "balance_loss_clip": 1.02290678, "balance_loss_mlp": 0.99985713, "epoch": 0.21703841760355919, "flos": 58760955429120.0, "grad_norm": 0.925219396881176, "language_loss": 0.63134742, "learning_rate": 3.644263938160898e-06, "loss": 0.65196407, "num_input_tokens_seen": 38445760, "step": 1805, "time_per_iteration": 3.0824172496795654 }, { "auxiliary_loss_clip": 0.01166429, "auxiliary_loss_mlp": 0.01044619, "balance_loss_clip": 1.05720377, "balance_loss_mlp": 1.0348022, "epoch": 0.21715866049419827, "flos": 22418457419520.0, "grad_norm": 1.87844486554684, "language_loss": 0.71842462, "learning_rate": 3.6438203472668293e-06, "loss": 0.74053514, "num_input_tokens_seen": 38465405, "step": 1806, "time_per_iteration": 2.5857090950012207 }, { "auxiliary_loss_clip": 0.01187638, "auxiliary_loss_mlp": 0.0103725, "balance_loss_clip": 1.06016231, "balance_loss_mlp": 1.02861917, "epoch": 0.21727890338483738, "flos": 17237732952960.0, "grad_norm": 1.8965991438675915, "language_loss": 0.81905431, "learning_rate": 3.6433765070065206e-06, "loss": 0.84130323, "num_input_tokens_seen": 38483195, "step": 1807, "time_per_iteration": 2.53423810005188 }, { "auxiliary_loss_clip": 0.0121543, "auxiliary_loss_mlp": 0.01037318, "balance_loss_clip": 1.06327033, "balance_loss_mlp": 1.02750111, "epoch": 0.21739914627547646, "flos": 13434792416640.0, "grad_norm": 2.88233654437488, "language_loss": 0.87944436, "learning_rate": 3.6429324174473025e-06, "loss": 0.90197182, "num_input_tokens_seen": 38496735, "step": 1808, "time_per_iteration": 2.4329371452331543 }, { "auxiliary_loss_clip": 0.01202013, "auxiliary_loss_mlp": 0.01038364, "balance_loss_clip": 1.06155908, "balance_loss_mlp": 1.02889323, "epoch": 0.21751938916611555, "flos": 20959514709120.0, "grad_norm": 2.0222607603278147, "language_loss": 0.84500253, "learning_rate": 3.6424880786565425e-06, "loss": 0.86740625, "num_input_tokens_seen": 38512880, "step": 1809, "time_per_iteration": 3.270334243774414 }, { "auxiliary_loss_clip": 0.01149458, "auxiliary_loss_mlp": 0.01036837, "balance_loss_clip": 1.05595958, "balance_loss_mlp": 1.02665663, "epoch": 0.21763963205675466, "flos": 27599936071680.0, "grad_norm": 2.7817188172909275, "language_loss": 0.80076683, "learning_rate": 3.6420434907016482e-06, "loss": 0.82262975, "num_input_tokens_seen": 38532570, "step": 1810, "time_per_iteration": 2.6436312198638916 }, { "auxiliary_loss_clip": 0.01204455, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.06589079, "balance_loss_mlp": 1.0316937, "epoch": 0.21775987494739374, "flos": 21430411032960.0, "grad_norm": 1.8257748931333257, "language_loss": 0.81205678, "learning_rate": 3.6415986536500606e-06, "loss": 0.83450735, "num_input_tokens_seen": 38550900, "step": 1811, "time_per_iteration": 2.5162250995635986 }, { "auxiliary_loss_clip": 0.01149762, "auxiliary_loss_mlp": 0.01038284, "balance_loss_clip": 1.06046343, "balance_loss_mlp": 1.02965939, "epoch": 0.21788011783803282, "flos": 18332972501760.0, "grad_norm": 1.9240441529915235, "language_loss": 0.80348545, "learning_rate": 3.641153567569263e-06, "loss": 0.8253659, "num_input_tokens_seen": 38569215, "step": 1812, "time_per_iteration": 3.428393602371216 }, { "auxiliary_loss_clip": 0.01194554, "auxiliary_loss_mlp": 0.01034919, "balance_loss_clip": 1.06010473, "balance_loss_mlp": 1.02603793, "epoch": 0.2180003607286719, "flos": 30262748037120.0, "grad_norm": 2.1642542344481077, "language_loss": 0.95230532, "learning_rate": 3.640708232526774e-06, "loss": 0.97460008, "num_input_tokens_seen": 38587870, "step": 1813, "time_per_iteration": 2.5741560459136963 }, { "auxiliary_loss_clip": 0.01136357, "auxiliary_loss_mlp": 0.01036601, "balance_loss_clip": 1.04870605, "balance_loss_mlp": 1.02670097, "epoch": 0.21812060361931102, "flos": 25480272637440.0, "grad_norm": 1.6681753061438978, "language_loss": 0.78634769, "learning_rate": 3.6402626485901504e-06, "loss": 0.80807734, "num_input_tokens_seen": 38606965, "step": 1814, "time_per_iteration": 2.6998040676116943 }, { "auxiliary_loss_clip": 0.01198494, "auxiliary_loss_mlp": 0.01034868, "balance_loss_clip": 1.06461477, "balance_loss_mlp": 1.0256058, "epoch": 0.2182408465099501, "flos": 21908166854400.0, "grad_norm": 2.3349870223848224, "language_loss": 0.77970892, "learning_rate": 3.639816815826988e-06, "loss": 0.8020426, "num_input_tokens_seen": 38626290, "step": 1815, "time_per_iteration": 3.997248888015747 }, { "auxiliary_loss_clip": 0.01183767, "auxiliary_loss_mlp": 0.01029259, "balance_loss_clip": 1.05962157, "balance_loss_mlp": 1.02018118, "epoch": 0.21836108940058918, "flos": 23657339456640.0, "grad_norm": 1.9326084855370262, "language_loss": 0.77788258, "learning_rate": 3.6393707343049176e-06, "loss": 0.80001283, "num_input_tokens_seen": 38646620, "step": 1816, "time_per_iteration": 2.5739402770996094 }, { "auxiliary_loss_clip": 0.01201439, "auxiliary_loss_mlp": 0.01035053, "balance_loss_clip": 1.06011176, "balance_loss_mlp": 1.02534318, "epoch": 0.2184813322912283, "flos": 24681009156480.0, "grad_norm": 2.3638843607257494, "language_loss": 0.7390669, "learning_rate": 3.6389244040916104e-06, "loss": 0.76143181, "num_input_tokens_seen": 38665695, "step": 1817, "time_per_iteration": 2.5493388175964355 }, { "auxiliary_loss_clip": 0.01175062, "auxiliary_loss_mlp": 0.00764246, "balance_loss_clip": 1.05801857, "balance_loss_mlp": 1.00029755, "epoch": 0.21860157518186737, "flos": 26574650259840.0, "grad_norm": 2.071555003346373, "language_loss": 0.79437768, "learning_rate": 3.6384778252547747e-06, "loss": 0.81377077, "num_input_tokens_seen": 38681575, "step": 1818, "time_per_iteration": 2.569887161254883 }, { "auxiliary_loss_clip": 0.01183679, "auxiliary_loss_mlp": 0.0076314, "balance_loss_clip": 1.06510675, "balance_loss_mlp": 1.00028932, "epoch": 0.21872181807250646, "flos": 20886292834560.0, "grad_norm": 2.3638684582860456, "language_loss": 0.77852356, "learning_rate": 3.638030997862155e-06, "loss": 0.79799175, "num_input_tokens_seen": 38700510, "step": 1819, "time_per_iteration": 2.536895275115967 }, { "auxiliary_loss_clip": 0.01077158, "auxiliary_loss_mlp": 0.01006076, "balance_loss_clip": 1.02488899, "balance_loss_mlp": 1.00366771, "epoch": 0.21884206096314554, "flos": 61209452897280.0, "grad_norm": 0.7666662054381629, "language_loss": 0.59454888, "learning_rate": 3.6375839219815356e-06, "loss": 0.61538118, "num_input_tokens_seen": 38758310, "step": 1820, "time_per_iteration": 3.0439674854278564 }, { "auxiliary_loss_clip": 0.01213103, "auxiliary_loss_mlp": 0.01034858, "balance_loss_clip": 1.06146932, "balance_loss_mlp": 1.02485681, "epoch": 0.21896230385378465, "flos": 23473835850240.0, "grad_norm": 1.8835672874293465, "language_loss": 0.83094394, "learning_rate": 3.6371365976807375e-06, "loss": 0.85342354, "num_input_tokens_seen": 38778705, "step": 1821, "time_per_iteration": 2.5021250247955322 }, { "auxiliary_loss_clip": 0.01146234, "auxiliary_loss_mlp": 0.0104153, "balance_loss_clip": 1.05631769, "balance_loss_mlp": 1.0319519, "epoch": 0.21908254674442373, "flos": 25081915829760.0, "grad_norm": 1.6910193979020673, "language_loss": 0.83650446, "learning_rate": 3.6366890250276185e-06, "loss": 0.85838211, "num_input_tokens_seen": 38799660, "step": 1822, "time_per_iteration": 2.6365389823913574 }, { "auxiliary_loss_clip": 0.01212246, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.06205201, "balance_loss_mlp": 1.02567315, "epoch": 0.21920278963506282, "flos": 23513768795520.0, "grad_norm": 2.565131486087501, "language_loss": 0.89789653, "learning_rate": 3.6362412040900764e-06, "loss": 0.92036748, "num_input_tokens_seen": 38819450, "step": 1823, "time_per_iteration": 2.525851249694824 }, { "auxiliary_loss_clip": 0.01202778, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.0627079, "balance_loss_mlp": 1.02528775, "epoch": 0.21932303252570193, "flos": 29242238734080.0, "grad_norm": 1.8773757094131676, "language_loss": 0.80199826, "learning_rate": 3.635793134936044e-06, "loss": 0.82437575, "num_input_tokens_seen": 38840460, "step": 1824, "time_per_iteration": 2.6082136631011963 }, { "auxiliary_loss_clip": 0.0119634, "auxiliary_loss_mlp": 0.01038282, "balance_loss_clip": 1.06052113, "balance_loss_mlp": 1.02902579, "epoch": 0.219443275416341, "flos": 20806857907200.0, "grad_norm": 1.7518697399448588, "language_loss": 0.73258996, "learning_rate": 3.635344817633494e-06, "loss": 0.75493622, "num_input_tokens_seen": 38859775, "step": 1825, "time_per_iteration": 2.524298906326294 }, { "auxiliary_loss_clip": 0.01192391, "auxiliary_loss_mlp": 0.01036855, "balance_loss_clip": 1.05901122, "balance_loss_mlp": 1.02786064, "epoch": 0.2195635183069801, "flos": 14501555458560.0, "grad_norm": 4.1226091510224565, "language_loss": 0.75535458, "learning_rate": 3.634896252250436e-06, "loss": 0.77764696, "num_input_tokens_seen": 38876540, "step": 1826, "time_per_iteration": 2.483999490737915 }, { "auxiliary_loss_clip": 0.01219369, "auxiliary_loss_mlp": 0.01039355, "balance_loss_clip": 1.06670177, "balance_loss_mlp": 1.02895391, "epoch": 0.2196837611976192, "flos": 24243473589120.0, "grad_norm": 1.9221677693658599, "language_loss": 0.82259226, "learning_rate": 3.6344474388549157e-06, "loss": 0.84517944, "num_input_tokens_seen": 38896195, "step": 1827, "time_per_iteration": 2.502225399017334 }, { "auxiliary_loss_clip": 0.01199719, "auxiliary_loss_mlp": 0.01034937, "balance_loss_clip": 1.06288934, "balance_loss_mlp": 1.02457225, "epoch": 0.2198040040882583, "flos": 18074523168000.0, "grad_norm": 1.9736827601586437, "language_loss": 0.80324185, "learning_rate": 3.6339983775150183e-06, "loss": 0.82558846, "num_input_tokens_seen": 38912755, "step": 1828, "time_per_iteration": 2.4580917358398438 }, { "auxiliary_loss_clip": 0.01198992, "auxiliary_loss_mlp": 0.01040708, "balance_loss_clip": 1.06317544, "balance_loss_mlp": 1.0311234, "epoch": 0.21992424697889737, "flos": 17784185535360.0, "grad_norm": 3.2211218875311904, "language_loss": 0.84172028, "learning_rate": 3.6335490682988664e-06, "loss": 0.86411732, "num_input_tokens_seen": 38928365, "step": 1829, "time_per_iteration": 2.4820921421051025 }, { "auxiliary_loss_clip": 0.01131718, "auxiliary_loss_mlp": 0.01037307, "balance_loss_clip": 1.05155694, "balance_loss_mlp": 1.02793717, "epoch": 0.22004448986953645, "flos": 17638495971840.0, "grad_norm": 30.646845898126266, "language_loss": 0.82763278, "learning_rate": 3.63309951127462e-06, "loss": 0.84932303, "num_input_tokens_seen": 38945275, "step": 1830, "time_per_iteration": 2.6118664741516113 }, { "auxiliary_loss_clip": 0.01168673, "auxiliary_loss_mlp": 0.01042043, "balance_loss_clip": 1.05890727, "balance_loss_mlp": 1.03266096, "epoch": 0.22016473276017556, "flos": 22275533203200.0, "grad_norm": 2.6069377117147385, "language_loss": 0.7554816, "learning_rate": 3.6326497065104757e-06, "loss": 0.77758873, "num_input_tokens_seen": 38965740, "step": 1831, "time_per_iteration": 2.622213125228882 }, { "auxiliary_loss_clip": 0.01203808, "auxiliary_loss_mlp": 0.01042536, "balance_loss_clip": 1.06184101, "balance_loss_mlp": 1.0328567, "epoch": 0.22028497565081465, "flos": 25556259859200.0, "grad_norm": 2.0667166426303325, "language_loss": 0.78511298, "learning_rate": 3.6321996540746697e-06, "loss": 0.80757648, "num_input_tokens_seen": 38984815, "step": 1832, "time_per_iteration": 2.556617021560669 }, { "auxiliary_loss_clip": 0.01168508, "auxiliary_loss_mlp": 0.01040857, "balance_loss_clip": 1.05662847, "balance_loss_mlp": 1.03109348, "epoch": 0.22040521854145373, "flos": 36247332925440.0, "grad_norm": 1.7858488942817552, "language_loss": 0.80867457, "learning_rate": 3.6317493540354733e-06, "loss": 0.83076823, "num_input_tokens_seen": 39008230, "step": 1833, "time_per_iteration": 2.720679759979248 }, { "auxiliary_loss_clip": 0.01192827, "auxiliary_loss_mlp": 0.01033484, "balance_loss_clip": 1.05885386, "balance_loss_mlp": 1.02407241, "epoch": 0.22052546143209284, "flos": 11838420270720.0, "grad_norm": 2.010552468999786, "language_loss": 0.77242768, "learning_rate": 3.6312988064611976e-06, "loss": 0.79469073, "num_input_tokens_seen": 39026540, "step": 1834, "time_per_iteration": 3.3244943618774414 }, { "auxiliary_loss_clip": 0.01169492, "auxiliary_loss_mlp": 0.01036735, "balance_loss_clip": 1.05367279, "balance_loss_mlp": 1.02720439, "epoch": 0.22064570432273192, "flos": 24209250906240.0, "grad_norm": 1.8952734571863783, "language_loss": 0.81451273, "learning_rate": 3.6308480114201896e-06, "loss": 0.83657503, "num_input_tokens_seen": 39048460, "step": 1835, "time_per_iteration": 2.6163628101348877 }, { "auxiliary_loss_clip": 0.01216903, "auxiliary_loss_mlp": 0.01040161, "balance_loss_clip": 1.06598461, "balance_loss_mlp": 1.03067791, "epoch": 0.220765947213371, "flos": 17931347556480.0, "grad_norm": 1.849449296883682, "language_loss": 0.76427948, "learning_rate": 3.630396968980835e-06, "loss": 0.78685009, "num_input_tokens_seen": 39066335, "step": 1836, "time_per_iteration": 2.4813525676727295 }, { "auxiliary_loss_clip": 0.01183669, "auxiliary_loss_mlp": 0.01045498, "balance_loss_clip": 1.05789924, "balance_loss_mlp": 1.03519845, "epoch": 0.2208861901040101, "flos": 26757040544640.0, "grad_norm": 2.4955239342143027, "language_loss": 0.83745366, "learning_rate": 3.6299456792115575e-06, "loss": 0.85974532, "num_input_tokens_seen": 39087590, "step": 1837, "time_per_iteration": 3.374748706817627 }, { "auxiliary_loss_clip": 0.01105407, "auxiliary_loss_mlp": 0.01031632, "balance_loss_clip": 1.0460099, "balance_loss_mlp": 1.02253652, "epoch": 0.2210064329946492, "flos": 17817977255040.0, "grad_norm": 1.7741231884670803, "language_loss": 0.81190169, "learning_rate": 3.629494142180815e-06, "loss": 0.8332721, "num_input_tokens_seen": 39106335, "step": 1838, "time_per_iteration": 2.706540822982788 }, { "auxiliary_loss_clip": 0.01213527, "auxiliary_loss_mlp": 0.01034397, "balance_loss_clip": 1.0634129, "balance_loss_mlp": 1.02518821, "epoch": 0.22112667588528828, "flos": 17967401832960.0, "grad_norm": 2.0576280325168104, "language_loss": 0.85602421, "learning_rate": 3.6290423579571075e-06, "loss": 0.8785035, "num_input_tokens_seen": 39122875, "step": 1839, "time_per_iteration": 2.6072866916656494 }, { "auxiliary_loss_clip": 0.01196487, "auxiliary_loss_mlp": 0.01040481, "balance_loss_clip": 1.06289268, "balance_loss_mlp": 1.03109908, "epoch": 0.22124691877592736, "flos": 18369206346240.0, "grad_norm": 1.6387086328042546, "language_loss": 0.80326235, "learning_rate": 3.6285903266089694e-06, "loss": 0.82563198, "num_input_tokens_seen": 39142150, "step": 1840, "time_per_iteration": 2.4984676837921143 }, { "auxiliary_loss_clip": 0.01179879, "auxiliary_loss_mlp": 0.01028299, "balance_loss_clip": 1.05618691, "balance_loss_mlp": 1.01906586, "epoch": 0.22136716166656648, "flos": 20813286441600.0, "grad_norm": 1.694509582211568, "language_loss": 0.77017617, "learning_rate": 3.628138048204974e-06, "loss": 0.79225791, "num_input_tokens_seen": 39162835, "step": 1841, "time_per_iteration": 3.245981454849243 }, { "auxiliary_loss_clip": 0.01145619, "auxiliary_loss_mlp": 0.01040249, "balance_loss_clip": 1.05595756, "balance_loss_mlp": 1.02960408, "epoch": 0.22148740455720556, "flos": 17675699483520.0, "grad_norm": 2.093394560028967, "language_loss": 0.76006997, "learning_rate": 3.6276855228137304e-06, "loss": 0.7819286, "num_input_tokens_seen": 39181040, "step": 1842, "time_per_iteration": 3.3193488121032715 }, { "auxiliary_loss_clip": 0.01214137, "auxiliary_loss_mlp": 0.00763406, "balance_loss_clip": 1.06324983, "balance_loss_mlp": 1.00035822, "epoch": 0.22160764744784464, "flos": 21726710323200.0, "grad_norm": 2.0510147220362542, "language_loss": 0.81875038, "learning_rate": 3.6272327505038874e-06, "loss": 0.83852577, "num_input_tokens_seen": 39197505, "step": 1843, "time_per_iteration": 2.492896556854248 }, { "auxiliary_loss_clip": 0.01158678, "auxiliary_loss_mlp": 0.0103154, "balance_loss_clip": 1.05606616, "balance_loss_mlp": 1.02280235, "epoch": 0.22172789033848372, "flos": 23764712186880.0, "grad_norm": 2.7237161240798917, "language_loss": 0.78319204, "learning_rate": 3.626779731344131e-06, "loss": 0.80509424, "num_input_tokens_seen": 39217295, "step": 1844, "time_per_iteration": 2.615344762802124 }, { "auxiliary_loss_clip": 0.01209689, "auxiliary_loss_mlp": 0.01029382, "balance_loss_clip": 1.0602349, "balance_loss_mlp": 1.01994014, "epoch": 0.22184813322912283, "flos": 16982300361600.0, "grad_norm": 2.4405884354401666, "language_loss": 0.85374701, "learning_rate": 3.6263264654031814e-06, "loss": 0.87613773, "num_input_tokens_seen": 39234195, "step": 1845, "time_per_iteration": 2.4658052921295166 }, { "auxiliary_loss_clip": 0.01071997, "auxiliary_loss_mlp": 0.0100367, "balance_loss_clip": 1.02913725, "balance_loss_mlp": 1.00122583, "epoch": 0.22196837611976192, "flos": 61823740314240.0, "grad_norm": 0.6965947223285422, "language_loss": 0.59226727, "learning_rate": 3.6258729527498008e-06, "loss": 0.61302394, "num_input_tokens_seen": 39295040, "step": 1846, "time_per_iteration": 3.133504629135132 }, { "auxiliary_loss_clip": 0.01188678, "auxiliary_loss_mlp": 0.01035496, "balance_loss_clip": 1.06099463, "balance_loss_mlp": 1.02583385, "epoch": 0.222088619010401, "flos": 25558019625600.0, "grad_norm": 2.5171288989414684, "language_loss": 0.65436262, "learning_rate": 3.6254191934527854e-06, "loss": 0.67660439, "num_input_tokens_seen": 39314395, "step": 1847, "time_per_iteration": 2.594733476638794 }, { "auxiliary_loss_clip": 0.01168103, "auxiliary_loss_mlp": 0.01034382, "balance_loss_clip": 1.06054449, "balance_loss_mlp": 1.02395761, "epoch": 0.2222088619010401, "flos": 19318612677120.0, "grad_norm": 2.350690865316085, "language_loss": 0.65021193, "learning_rate": 3.6249651875809715e-06, "loss": 0.67223674, "num_input_tokens_seen": 39334275, "step": 1848, "time_per_iteration": 2.5676612854003906 }, { "auxiliary_loss_clip": 0.01182358, "auxiliary_loss_mlp": 0.01037923, "balance_loss_clip": 1.06233251, "balance_loss_mlp": 1.02767682, "epoch": 0.2223291047916792, "flos": 19099342103040.0, "grad_norm": 1.830306359786449, "language_loss": 0.89280736, "learning_rate": 3.62451093520323e-06, "loss": 0.91501009, "num_input_tokens_seen": 39352180, "step": 1849, "time_per_iteration": 2.5562593936920166 }, { "auxiliary_loss_clip": 0.01148038, "auxiliary_loss_mlp": 0.0103471, "balance_loss_clip": 1.05095196, "balance_loss_mlp": 1.02532244, "epoch": 0.22244934768231828, "flos": 20850418126080.0, "grad_norm": 1.9163221656370755, "language_loss": 0.90741706, "learning_rate": 3.6240564363884714e-06, "loss": 0.92924452, "num_input_tokens_seen": 39372125, "step": 1850, "time_per_iteration": 2.6089136600494385 }, { "auxiliary_loss_clip": 0.01199404, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.05825996, "balance_loss_mlp": 1.0242945, "epoch": 0.2225695905729574, "flos": 15632921111040.0, "grad_norm": 2.1256972724704863, "language_loss": 0.7026329, "learning_rate": 3.623601691205643e-06, "loss": 0.72496784, "num_input_tokens_seen": 39391200, "step": 1851, "time_per_iteration": 2.521548271179199 }, { "auxiliary_loss_clip": 0.01197021, "auxiliary_loss_mlp": 0.01032788, "balance_loss_clip": 1.06029177, "balance_loss_mlp": 1.023579, "epoch": 0.22268983346359647, "flos": 25373582265600.0, "grad_norm": 2.483894779579382, "language_loss": 0.82093096, "learning_rate": 3.623146699723729e-06, "loss": 0.84322906, "num_input_tokens_seen": 39410660, "step": 1852, "time_per_iteration": 2.550711154937744 }, { "auxiliary_loss_clip": 0.01186689, "auxiliary_loss_mlp": 0.01036369, "balance_loss_clip": 1.06509554, "balance_loss_mlp": 1.02679062, "epoch": 0.22281007635423555, "flos": 13261452359040.0, "grad_norm": 1.8539121793025382, "language_loss": 0.7756933, "learning_rate": 3.6226914620117507e-06, "loss": 0.79792386, "num_input_tokens_seen": 39429280, "step": 1853, "time_per_iteration": 2.5188686847686768 }, { "auxiliary_loss_clip": 0.01165419, "auxiliary_loss_mlp": 0.01033854, "balance_loss_clip": 1.05035377, "balance_loss_mlp": 1.02434659, "epoch": 0.22293031924487464, "flos": 15340536403200.0, "grad_norm": 1.9801601468344223, "language_loss": 0.80732369, "learning_rate": 3.622235978138768e-06, "loss": 0.82931644, "num_input_tokens_seen": 39446905, "step": 1854, "time_per_iteration": 2.5568552017211914 }, { "auxiliary_loss_clip": 0.01198843, "auxiliary_loss_mlp": 0.01033223, "balance_loss_clip": 1.0634228, "balance_loss_mlp": 1.02393627, "epoch": 0.22305056213551375, "flos": 22564649773440.0, "grad_norm": 2.4776494870249746, "language_loss": 0.81541133, "learning_rate": 3.621780248173877e-06, "loss": 0.83773196, "num_input_tokens_seen": 39465105, "step": 1855, "time_per_iteration": 2.530457019805908 }, { "auxiliary_loss_clip": 0.01095731, "auxiliary_loss_mlp": 0.01006594, "balance_loss_clip": 1.0273639, "balance_loss_mlp": 1.00416243, "epoch": 0.22317080502615283, "flos": 64880419887360.0, "grad_norm": 0.8291200037751248, "language_loss": 0.61084032, "learning_rate": 3.6213242721862125e-06, "loss": 0.63186353, "num_input_tokens_seen": 39523560, "step": 1856, "time_per_iteration": 3.130747079849243 }, { "auxiliary_loss_clip": 0.01175039, "auxiliary_loss_mlp": 0.01039374, "balance_loss_clip": 1.06037247, "balance_loss_mlp": 1.03040934, "epoch": 0.2232910479167919, "flos": 25775997310080.0, "grad_norm": 1.874202357832368, "language_loss": 0.75434506, "learning_rate": 3.620868050244945e-06, "loss": 0.77648914, "num_input_tokens_seen": 39544040, "step": 1857, "time_per_iteration": 2.582007646560669 }, { "auxiliary_loss_clip": 0.011778, "auxiliary_loss_mlp": 0.01045728, "balance_loss_clip": 1.05853522, "balance_loss_mlp": 1.03517795, "epoch": 0.22341129080743102, "flos": 23251799928960.0, "grad_norm": 1.7235808135200084, "language_loss": 0.7762146, "learning_rate": 3.6204115824192817e-06, "loss": 0.79844987, "num_input_tokens_seen": 39561515, "step": 1858, "time_per_iteration": 2.547947883605957 }, { "auxiliary_loss_clip": 0.01172614, "auxiliary_loss_mlp": 0.01033928, "balance_loss_clip": 1.05484819, "balance_loss_mlp": 1.02386642, "epoch": 0.2235315336980701, "flos": 21214552250880.0, "grad_norm": 2.523170148450079, "language_loss": 0.77555555, "learning_rate": 3.619954868778471e-06, "loss": 0.79762101, "num_input_tokens_seen": 39578210, "step": 1859, "time_per_iteration": 3.2807791233062744 }, { "auxiliary_loss_clip": 0.01182834, "auxiliary_loss_mlp": 0.01041229, "balance_loss_clip": 1.0572679, "balance_loss_mlp": 1.03146529, "epoch": 0.2236517765887092, "flos": 19901945548800.0, "grad_norm": 1.9687584915419636, "language_loss": 0.83095855, "learning_rate": 3.6194979093917944e-06, "loss": 0.85319918, "num_input_tokens_seen": 39597625, "step": 1860, "time_per_iteration": 2.5474274158477783 }, { "auxiliary_loss_clip": 0.01176393, "auxiliary_loss_mlp": 0.01047797, "balance_loss_clip": 1.05711746, "balance_loss_mlp": 1.03800404, "epoch": 0.22377201947934827, "flos": 23214847812480.0, "grad_norm": 1.8125389370462268, "language_loss": 0.87096012, "learning_rate": 3.6190407043285724e-06, "loss": 0.89320207, "num_input_tokens_seen": 39615360, "step": 1861, "time_per_iteration": 2.562239170074463 }, { "auxiliary_loss_clip": 0.01211864, "auxiliary_loss_mlp": 0.0104377, "balance_loss_clip": 1.06124127, "balance_loss_mlp": 1.03345847, "epoch": 0.22389226236998738, "flos": 26794244056320.0, "grad_norm": 1.8856752592588955, "language_loss": 0.75761425, "learning_rate": 3.618583253658163e-06, "loss": 0.78017056, "num_input_tokens_seen": 39635460, "step": 1862, "time_per_iteration": 2.5362656116485596 }, { "auxiliary_loss_clip": 0.01154768, "auxiliary_loss_mlp": 0.00763483, "balance_loss_clip": 1.05759501, "balance_loss_mlp": 1.00039923, "epoch": 0.22401250526062647, "flos": 24170359455360.0, "grad_norm": 1.796451844300776, "language_loss": 0.86452794, "learning_rate": 3.618125557449961e-06, "loss": 0.8837105, "num_input_tokens_seen": 39653515, "step": 1863, "time_per_iteration": 3.4278626441955566 }, { "auxiliary_loss_clip": 0.01197144, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.06355286, "balance_loss_mlp": 1.02499628, "epoch": 0.22413274815126555, "flos": 16759761649920.0, "grad_norm": 1.8411295110707282, "language_loss": 0.8285675, "learning_rate": 3.6176676157733983e-06, "loss": 0.85088086, "num_input_tokens_seen": 39668525, "step": 1864, "time_per_iteration": 2.4845263957977295 }, { "auxiliary_loss_clip": 0.01160131, "auxiliary_loss_mlp": 0.01044877, "balance_loss_clip": 1.05354643, "balance_loss_mlp": 1.03464842, "epoch": 0.22425299104190466, "flos": 21360205900800.0, "grad_norm": 2.1062668448182222, "language_loss": 0.75494969, "learning_rate": 3.6172094286979443e-06, "loss": 0.77699977, "num_input_tokens_seen": 39685895, "step": 1865, "time_per_iteration": 2.5725181102752686 }, { "auxiliary_loss_clip": 0.01180036, "auxiliary_loss_mlp": 0.01028055, "balance_loss_clip": 1.05538738, "balance_loss_mlp": 1.01862001, "epoch": 0.22437323393254374, "flos": 32165547108480.0, "grad_norm": 1.8129473885452336, "language_loss": 0.81321126, "learning_rate": 3.6167509962931064e-06, "loss": 0.83529216, "num_input_tokens_seen": 39711595, "step": 1866, "time_per_iteration": 2.6856613159179688 }, { "auxiliary_loss_clip": 0.01162677, "auxiliary_loss_mlp": 0.0104127, "balance_loss_clip": 1.05902338, "balance_loss_mlp": 1.03151894, "epoch": 0.22449347682318282, "flos": 18002809664640.0, "grad_norm": 2.9501471794501186, "language_loss": 0.77164179, "learning_rate": 3.6162923186284276e-06, "loss": 0.79368126, "num_input_tokens_seen": 39727555, "step": 1867, "time_per_iteration": 3.303394079208374 }, { "auxiliary_loss_clip": 0.01180947, "auxiliary_loss_mlp": 0.01038703, "balance_loss_clip": 1.05761194, "balance_loss_mlp": 1.02955937, "epoch": 0.2246137197138219, "flos": 18697286194560.0, "grad_norm": 2.1785657643537113, "language_loss": 0.85826862, "learning_rate": 3.6158333957734888e-06, "loss": 0.88046515, "num_input_tokens_seen": 39746145, "step": 1868, "time_per_iteration": 3.297990322113037 }, { "auxiliary_loss_clip": 0.01171434, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.05607045, "balance_loss_mlp": 1.02597797, "epoch": 0.22473396260446102, "flos": 15590653781760.0, "grad_norm": 2.109386333761803, "language_loss": 0.82767391, "learning_rate": 3.6153742277979088e-06, "loss": 0.84973395, "num_input_tokens_seen": 39763575, "step": 1869, "time_per_iteration": 2.5543501377105713 }, { "auxiliary_loss_clip": 0.01183716, "auxiliary_loss_mlp": 0.01041938, "balance_loss_clip": 1.05921221, "balance_loss_mlp": 1.03207898, "epoch": 0.2248542054951001, "flos": 14465501182080.0, "grad_norm": 2.4162409343010993, "language_loss": 0.78191817, "learning_rate": 3.6149148147713434e-06, "loss": 0.80417466, "num_input_tokens_seen": 39781810, "step": 1870, "time_per_iteration": 2.5225110054016113 }, { "auxiliary_loss_clip": 0.01205775, "auxiliary_loss_mlp": 0.01039485, "balance_loss_clip": 1.06619167, "balance_loss_mlp": 1.02989435, "epoch": 0.22497444838573918, "flos": 19243882431360.0, "grad_norm": 1.8313310016783932, "language_loss": 0.86329424, "learning_rate": 3.614455156763484e-06, "loss": 0.88574684, "num_input_tokens_seen": 39800115, "step": 1871, "time_per_iteration": 2.5161828994750977 }, { "auxiliary_loss_clip": 0.0114912, "auxiliary_loss_mlp": 0.01033337, "balance_loss_clip": 1.05112898, "balance_loss_mlp": 1.02375901, "epoch": 0.2250946912763783, "flos": 16910299549440.0, "grad_norm": 2.6437667958172617, "language_loss": 0.71468502, "learning_rate": 3.613995253844061e-06, "loss": 0.73650956, "num_input_tokens_seen": 39817795, "step": 1872, "time_per_iteration": 2.5896646976470947 }, { "auxiliary_loss_clip": 0.01192921, "auxiliary_loss_mlp": 0.01029331, "balance_loss_clip": 1.06008852, "balance_loss_mlp": 1.02030087, "epoch": 0.22521493416701738, "flos": 24681368292480.0, "grad_norm": 1.9038835185068268, "language_loss": 0.80674583, "learning_rate": 3.6135351060828414e-06, "loss": 0.82896835, "num_input_tokens_seen": 39838270, "step": 1873, "time_per_iteration": 2.551215887069702 }, { "auxiliary_loss_clip": 0.01214386, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.06327736, "balance_loss_mlp": 1.02124047, "epoch": 0.22533517705765646, "flos": 17821963664640.0, "grad_norm": 2.041245172053233, "language_loss": 0.69323742, "learning_rate": 3.6130747135496285e-06, "loss": 0.71569443, "num_input_tokens_seen": 39857270, "step": 1874, "time_per_iteration": 2.472790241241455 }, { "auxiliary_loss_clip": 0.01210305, "auxiliary_loss_mlp": 0.01037879, "balance_loss_clip": 1.06120753, "balance_loss_mlp": 1.02793658, "epoch": 0.22545541994829554, "flos": 33691390899840.0, "grad_norm": 1.90371822530015, "language_loss": 0.65816116, "learning_rate": 3.6126140763142646e-06, "loss": 0.68064302, "num_input_tokens_seen": 39882300, "step": 1875, "time_per_iteration": 2.589491605758667 }, { "auxiliary_loss_clip": 0.01211762, "auxiliary_loss_mlp": 0.01036997, "balance_loss_clip": 1.06267023, "balance_loss_mlp": 1.02692413, "epoch": 0.22557566283893465, "flos": 19171594310400.0, "grad_norm": 2.8709569794141694, "language_loss": 0.8641867, "learning_rate": 3.6121531944466275e-06, "loss": 0.88667434, "num_input_tokens_seen": 39899625, "step": 1876, "time_per_iteration": 2.472351312637329 }, { "auxiliary_loss_clip": 0.01196743, "auxiliary_loss_mlp": 0.01037134, "balance_loss_clip": 1.06118059, "balance_loss_mlp": 1.02831292, "epoch": 0.22569590572957374, "flos": 20773281669120.0, "grad_norm": 2.560105114809103, "language_loss": 0.78017592, "learning_rate": 3.611692068016633e-06, "loss": 0.80251467, "num_input_tokens_seen": 39915955, "step": 1877, "time_per_iteration": 2.503307342529297 }, { "auxiliary_loss_clip": 0.01162645, "auxiliary_loss_mlp": 0.01040152, "balance_loss_clip": 1.05454421, "balance_loss_mlp": 1.03038847, "epoch": 0.22581614862021282, "flos": 18442715529600.0, "grad_norm": 1.9942141732131287, "language_loss": 0.74857259, "learning_rate": 3.611230697094233e-06, "loss": 0.77060056, "num_input_tokens_seen": 39932655, "step": 1878, "time_per_iteration": 2.540508508682251 }, { "auxiliary_loss_clip": 0.01182474, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.05579758, "balance_loss_mlp": 1.02827168, "epoch": 0.22593639151085193, "flos": 20048389297920.0, "grad_norm": 1.887005556463023, "language_loss": 0.87388211, "learning_rate": 3.6107690817494173e-06, "loss": 0.89608371, "num_input_tokens_seen": 39952875, "step": 1879, "time_per_iteration": 2.54695463180542 }, { "auxiliary_loss_clip": 0.01145533, "auxiliary_loss_mlp": 0.01035844, "balance_loss_clip": 1.05218971, "balance_loss_mlp": 1.02654564, "epoch": 0.226056634401491, "flos": 13115116350720.0, "grad_norm": 2.171346060298478, "language_loss": 0.70407248, "learning_rate": 3.6103072220522117e-06, "loss": 0.72588629, "num_input_tokens_seen": 39968405, "step": 1880, "time_per_iteration": 2.5786023139953613 }, { "auxiliary_loss_clip": 0.01166809, "auxiliary_loss_mlp": 0.01041634, "balance_loss_clip": 1.05485153, "balance_loss_mlp": 1.03303885, "epoch": 0.2261768772921301, "flos": 18988378012800.0, "grad_norm": 1.7244562508886796, "language_loss": 0.9192915, "learning_rate": 3.609845118072682e-06, "loss": 0.94137591, "num_input_tokens_seen": 39987075, "step": 1881, "time_per_iteration": 2.572270393371582 }, { "auxiliary_loss_clip": 0.01201818, "auxiliary_loss_mlp": 0.00763669, "balance_loss_clip": 1.06059468, "balance_loss_mlp": 1.00038862, "epoch": 0.2262971201827692, "flos": 19974054101760.0, "grad_norm": 1.6692317203724176, "language_loss": 0.80261701, "learning_rate": 3.6093827698809276e-06, "loss": 0.82227188, "num_input_tokens_seen": 40006175, "step": 1882, "time_per_iteration": 2.5336966514587402 }, { "auxiliary_loss_clip": 0.0119022, "auxiliary_loss_mlp": 0.01026505, "balance_loss_clip": 1.05446768, "balance_loss_mlp": 1.01778483, "epoch": 0.2264173630734083, "flos": 16654543735680.0, "grad_norm": 2.4903815238412763, "language_loss": 0.84766835, "learning_rate": 3.6089201775470864e-06, "loss": 0.86983562, "num_input_tokens_seen": 40021630, "step": 1883, "time_per_iteration": 2.4835305213928223 }, { "auxiliary_loss_clip": 0.01154935, "auxiliary_loss_mlp": 0.01033683, "balance_loss_clip": 1.05408907, "balance_loss_mlp": 1.02467132, "epoch": 0.22653760596404737, "flos": 24389809597440.0, "grad_norm": 1.5347569096061404, "language_loss": 0.77418613, "learning_rate": 3.6084573411413334e-06, "loss": 0.79607236, "num_input_tokens_seen": 40041025, "step": 1884, "time_per_iteration": 2.632819414138794 }, { "auxiliary_loss_clip": 0.01166078, "auxiliary_loss_mlp": 0.01033809, "balance_loss_clip": 1.05730677, "balance_loss_mlp": 1.02471972, "epoch": 0.22665784885468646, "flos": 18332541538560.0, "grad_norm": 2.0933227833626766, "language_loss": 0.80962884, "learning_rate": 3.607994260733881e-06, "loss": 0.83162773, "num_input_tokens_seen": 40060265, "step": 1885, "time_per_iteration": 3.347014904022217 }, { "auxiliary_loss_clip": 0.01181819, "auxiliary_loss_mlp": 0.0103084, "balance_loss_clip": 1.05415034, "balance_loss_mlp": 1.02260828, "epoch": 0.22677809174532557, "flos": 24058102475520.0, "grad_norm": 1.6331163052628141, "language_loss": 0.74705458, "learning_rate": 3.6075309363949776e-06, "loss": 0.76918113, "num_input_tokens_seen": 40079435, "step": 1886, "time_per_iteration": 2.5312106609344482 }, { "auxiliary_loss_clip": 0.01208325, "auxiliary_loss_mlp": 0.01030971, "balance_loss_clip": 1.05939555, "balance_loss_mlp": 1.02145195, "epoch": 0.22689833463596465, "flos": 20374242503040.0, "grad_norm": 1.886772979988569, "language_loss": 0.81155455, "learning_rate": 3.6070673681949094e-06, "loss": 0.83394754, "num_input_tokens_seen": 40097800, "step": 1887, "time_per_iteration": 2.505464553833008 }, { "auxiliary_loss_clip": 0.01182526, "auxiliary_loss_mlp": 0.00762529, "balance_loss_clip": 1.0588901, "balance_loss_mlp": 1.00036085, "epoch": 0.22701857752660373, "flos": 30120398438400.0, "grad_norm": 1.6529379436572094, "language_loss": 0.81408787, "learning_rate": 3.606603556203999e-06, "loss": 0.83353841, "num_input_tokens_seen": 40122745, "step": 1888, "time_per_iteration": 2.6385016441345215 }, { "auxiliary_loss_clip": 0.01194917, "auxiliary_loss_mlp": 0.0104001, "balance_loss_clip": 1.05822968, "balance_loss_mlp": 1.03103924, "epoch": 0.22713882041724284, "flos": 22492182084480.0, "grad_norm": 2.1989570165227565, "language_loss": 0.83658361, "learning_rate": 3.6061395004926066e-06, "loss": 0.85893285, "num_input_tokens_seen": 40141680, "step": 1889, "time_per_iteration": 2.522618055343628 }, { "auxiliary_loss_clip": 0.01207661, "auxiliary_loss_mlp": 0.01035492, "balance_loss_clip": 1.05987263, "balance_loss_mlp": 1.02605116, "epoch": 0.22725906330788193, "flos": 20521548178560.0, "grad_norm": 2.848856048074611, "language_loss": 0.85260934, "learning_rate": 3.605675201131129e-06, "loss": 0.87504089, "num_input_tokens_seen": 40160140, "step": 1890, "time_per_iteration": 3.2511303424835205 }, { "auxiliary_loss_clip": 0.01197346, "auxiliary_loss_mlp": 0.01032462, "balance_loss_clip": 1.05915046, "balance_loss_mlp": 1.02300835, "epoch": 0.227379306198521, "flos": 18989922297600.0, "grad_norm": 2.197995193907923, "language_loss": 0.79822981, "learning_rate": 3.60521065819e-06, "loss": 0.82052791, "num_input_tokens_seen": 40177450, "step": 1891, "time_per_iteration": 2.4789669513702393 }, { "auxiliary_loss_clip": 0.01178226, "auxiliary_loss_mlp": 0.01030974, "balance_loss_clip": 1.05385184, "balance_loss_mlp": 1.02254057, "epoch": 0.2274995490891601, "flos": 21798351999360.0, "grad_norm": 1.9135752120304614, "language_loss": 0.87378722, "learning_rate": 3.60474587173969e-06, "loss": 0.89587921, "num_input_tokens_seen": 40195935, "step": 1892, "time_per_iteration": 2.5518712997436523 }, { "auxiliary_loss_clip": 0.01195431, "auxiliary_loss_mlp": 0.01031073, "balance_loss_clip": 1.06405771, "balance_loss_mlp": 1.02144659, "epoch": 0.2276197919797992, "flos": 19058654972160.0, "grad_norm": 1.880898699091292, "language_loss": 0.84186351, "learning_rate": 3.6042808418507084e-06, "loss": 0.86412859, "num_input_tokens_seen": 40213620, "step": 1893, "time_per_iteration": 3.2313616275787354 }, { "auxiliary_loss_clip": 0.01196099, "auxiliary_loss_mlp": 0.0103601, "balance_loss_clip": 1.06240106, "balance_loss_mlp": 1.02647877, "epoch": 0.22774003487043828, "flos": 18806777827200.0, "grad_norm": 2.4771437259053872, "language_loss": 0.76713437, "learning_rate": 3.6038155685935976e-06, "loss": 0.78945547, "num_input_tokens_seen": 40230190, "step": 1894, "time_per_iteration": 3.2739810943603516 }, { "auxiliary_loss_clip": 0.01191352, "auxiliary_loss_mlp": 0.01036591, "balance_loss_clip": 1.05800605, "balance_loss_mlp": 1.02672005, "epoch": 0.22786027776107737, "flos": 23002544476800.0, "grad_norm": 2.1053670696656934, "language_loss": 0.70590806, "learning_rate": 3.6033500520389404e-06, "loss": 0.72818756, "num_input_tokens_seen": 40246860, "step": 1895, "time_per_iteration": 2.5142710208892822 }, { "auxiliary_loss_clip": 0.01059769, "auxiliary_loss_mlp": 0.01002334, "balance_loss_clip": 1.0213356, "balance_loss_mlp": 1.00009286, "epoch": 0.22798052065171648, "flos": 66706872600960.0, "grad_norm": 0.7882888454372766, "language_loss": 0.6479944, "learning_rate": 3.6028842922573553e-06, "loss": 0.66861546, "num_input_tokens_seen": 40311005, "step": 1896, "time_per_iteration": 3.270555257797241 }, { "auxiliary_loss_clip": 0.01073782, "auxiliary_loss_mlp": 0.00753501, "balance_loss_clip": 1.02457738, "balance_loss_mlp": 1.00010252, "epoch": 0.22810076354235556, "flos": 62080896758400.0, "grad_norm": 0.8546828944597508, "language_loss": 0.62969607, "learning_rate": 3.602418289319497e-06, "loss": 0.64796889, "num_input_tokens_seen": 40369560, "step": 1897, "time_per_iteration": 3.1438465118408203 }, { "auxiliary_loss_clip": 0.01148279, "auxiliary_loss_mlp": 0.01034494, "balance_loss_clip": 1.0532012, "balance_loss_mlp": 1.02523744, "epoch": 0.22822100643299464, "flos": 23876358635520.0, "grad_norm": 2.5057844390170083, "language_loss": 0.72962904, "learning_rate": 3.601952043296059e-06, "loss": 0.75145674, "num_input_tokens_seen": 40389555, "step": 1898, "time_per_iteration": 2.612128734588623 }, { "auxiliary_loss_clip": 0.01184196, "auxiliary_loss_mlp": 0.01031288, "balance_loss_clip": 1.05551958, "balance_loss_mlp": 1.02153707, "epoch": 0.22834124932363373, "flos": 20991331180800.0, "grad_norm": 2.104706619519775, "language_loss": 0.80711555, "learning_rate": 3.6014855542577696e-06, "loss": 0.82927042, "num_input_tokens_seen": 40406765, "step": 1899, "time_per_iteration": 2.554781913757324 }, { "auxiliary_loss_clip": 0.01180515, "auxiliary_loss_mlp": 0.01041142, "balance_loss_clip": 1.05823493, "balance_loss_mlp": 1.03196335, "epoch": 0.22846149221427284, "flos": 24901572620160.0, "grad_norm": 1.7038358297561893, "language_loss": 0.84422112, "learning_rate": 3.6010188222753943e-06, "loss": 0.86643767, "num_input_tokens_seen": 40427535, "step": 1900, "time_per_iteration": 2.581376791000366 }, { "auxiliary_loss_clip": 0.01080726, "auxiliary_loss_mlp": 0.01002959, "balance_loss_clip": 1.02542686, "balance_loss_mlp": 1.00077713, "epoch": 0.22858173510491192, "flos": 56132294319360.0, "grad_norm": 0.9202511169432144, "language_loss": 0.64238048, "learning_rate": 3.6005518474197372e-06, "loss": 0.66321731, "num_input_tokens_seen": 40479580, "step": 1901, "time_per_iteration": 3.0441086292266846 }, { "auxiliary_loss_clip": 0.01194653, "auxiliary_loss_mlp": 0.01044283, "balance_loss_clip": 1.06113112, "balance_loss_mlp": 1.03403091, "epoch": 0.228701977995551, "flos": 24170826332160.0, "grad_norm": 2.248873604252607, "language_loss": 0.78329653, "learning_rate": 3.6000846297616373e-06, "loss": 0.80568588, "num_input_tokens_seen": 40497880, "step": 1902, "time_per_iteration": 2.5509328842163086 }, { "auxiliary_loss_clip": 0.01217675, "auxiliary_loss_mlp": 0.01036817, "balance_loss_clip": 1.0676347, "balance_loss_mlp": 1.02681541, "epoch": 0.22882222088619011, "flos": 21387892308480.0, "grad_norm": 2.0874817811671353, "language_loss": 0.72710383, "learning_rate": 3.5996171693719717e-06, "loss": 0.74964869, "num_input_tokens_seen": 40513975, "step": 1903, "time_per_iteration": 2.468595266342163 }, { "auxiliary_loss_clip": 0.01091794, "auxiliary_loss_mlp": 0.01007006, "balance_loss_clip": 1.02415943, "balance_loss_mlp": 1.00478864, "epoch": 0.2289424637768292, "flos": 64589615377920.0, "grad_norm": 0.9343989992060507, "language_loss": 0.64823055, "learning_rate": 3.5991494663216528e-06, "loss": 0.66921854, "num_input_tokens_seen": 40576960, "step": 1904, "time_per_iteration": 3.1764278411865234 }, { "auxiliary_loss_clip": 0.01211573, "auxiliary_loss_mlp": 0.01035965, "balance_loss_clip": 1.06358171, "balance_loss_mlp": 1.02651167, "epoch": 0.22906270666746828, "flos": 22163419877760.0, "grad_norm": 2.334156799380791, "language_loss": 0.87629145, "learning_rate": 3.5986815206816314e-06, "loss": 0.89876676, "num_input_tokens_seen": 40595780, "step": 1905, "time_per_iteration": 2.496854066848755 }, { "auxiliary_loss_clip": 0.01207554, "auxiliary_loss_mlp": 0.01037277, "balance_loss_clip": 1.05968499, "balance_loss_mlp": 1.02837217, "epoch": 0.2291829495581074, "flos": 25772334122880.0, "grad_norm": 1.7741217724551384, "language_loss": 0.74945891, "learning_rate": 3.598213332522895e-06, "loss": 0.77190721, "num_input_tokens_seen": 40615810, "step": 1906, "time_per_iteration": 2.544039249420166 }, { "auxiliary_loss_clip": 0.01192404, "auxiliary_loss_mlp": 0.01039873, "balance_loss_clip": 1.0583787, "balance_loss_mlp": 1.03071129, "epoch": 0.22930319244874647, "flos": 31172760126720.0, "grad_norm": 2.49109483612779, "language_loss": 0.77307087, "learning_rate": 3.597744901916466e-06, "loss": 0.79539359, "num_input_tokens_seen": 40637095, "step": 1907, "time_per_iteration": 2.5663156509399414 }, { "auxiliary_loss_clip": 0.01210178, "auxiliary_loss_mlp": 0.0103741, "balance_loss_clip": 1.05919099, "balance_loss_mlp": 1.02728868, "epoch": 0.22942343533938556, "flos": 23254098399360.0, "grad_norm": 2.091262418671253, "language_loss": 0.76855582, "learning_rate": 3.5972762289334058e-06, "loss": 0.79103172, "num_input_tokens_seen": 40656725, "step": 1908, "time_per_iteration": 2.4929656982421875 }, { "auxiliary_loss_clip": 0.0113097, "auxiliary_loss_mlp": 0.01041462, "balance_loss_clip": 1.05308437, "balance_loss_mlp": 1.03180635, "epoch": 0.22954367823002464, "flos": 14610903436800.0, "grad_norm": 1.9418205793081609, "language_loss": 0.84767354, "learning_rate": 3.5968073136448116e-06, "loss": 0.86939788, "num_input_tokens_seen": 40674745, "step": 1909, "time_per_iteration": 2.627415418624878 }, { "auxiliary_loss_clip": 0.01199274, "auxiliary_loss_mlp": 0.01032546, "balance_loss_clip": 1.06075549, "balance_loss_mlp": 1.02282417, "epoch": 0.22966392112066375, "flos": 16763604405120.0, "grad_norm": 2.1397466064419617, "language_loss": 0.9139207, "learning_rate": 3.596338156121818e-06, "loss": 0.93623888, "num_input_tokens_seen": 40693630, "step": 1910, "time_per_iteration": 2.4904751777648926 }, { "auxiliary_loss_clip": 0.01079282, "auxiliary_loss_mlp": 0.01002623, "balance_loss_clip": 1.02356744, "balance_loss_mlp": 1.00029814, "epoch": 0.22978416401130283, "flos": 67474247783040.0, "grad_norm": 0.762638861326994, "language_loss": 0.59357977, "learning_rate": 3.595868756435595e-06, "loss": 0.61439878, "num_input_tokens_seen": 40761310, "step": 1911, "time_per_iteration": 4.008214235305786 }, { "auxiliary_loss_clip": 0.01173355, "auxiliary_loss_mlp": 0.01038615, "balance_loss_clip": 1.06313705, "balance_loss_mlp": 1.02906632, "epoch": 0.22990440690194192, "flos": 19865137086720.0, "grad_norm": 2.19409478454432, "language_loss": 0.806615, "learning_rate": 3.5953991146573504e-06, "loss": 0.82873464, "num_input_tokens_seen": 40779955, "step": 1912, "time_per_iteration": 2.561598300933838 }, { "auxiliary_loss_clip": 0.01196481, "auxiliary_loss_mlp": 0.01040686, "balance_loss_clip": 1.05758238, "balance_loss_mlp": 1.03052902, "epoch": 0.23002464979258103, "flos": 13289246507520.0, "grad_norm": 2.2597784286697546, "language_loss": 0.83029336, "learning_rate": 3.5949292308583294e-06, "loss": 0.85266507, "num_input_tokens_seen": 40793200, "step": 1913, "time_per_iteration": 2.4531362056732178 }, { "auxiliary_loss_clip": 0.01211718, "auxiliary_loss_mlp": 0.01035532, "balance_loss_clip": 1.06396151, "balance_loss_mlp": 1.02577496, "epoch": 0.2301448926832201, "flos": 22163779013760.0, "grad_norm": 2.1135003960424195, "language_loss": 0.80764192, "learning_rate": 3.594459105109811e-06, "loss": 0.83011436, "num_input_tokens_seen": 40812380, "step": 1914, "time_per_iteration": 2.496546983718872 }, { "auxiliary_loss_clip": 0.01199574, "auxiliary_loss_mlp": 0.01036196, "balance_loss_clip": 1.06190777, "balance_loss_mlp": 1.02719593, "epoch": 0.2302651355738592, "flos": 20704477167360.0, "grad_norm": 1.7163182533266421, "language_loss": 0.81475717, "learning_rate": 3.593988737483115e-06, "loss": 0.83711481, "num_input_tokens_seen": 40832320, "step": 1915, "time_per_iteration": 2.499314308166504 }, { "auxiliary_loss_clip": 0.01184553, "auxiliary_loss_mlp": 0.01038583, "balance_loss_clip": 1.06150448, "balance_loss_mlp": 1.02909422, "epoch": 0.23038537846449827, "flos": 18588943797120.0, "grad_norm": 1.996662778407666, "language_loss": 0.78339398, "learning_rate": 3.5935181280495947e-06, "loss": 0.80562532, "num_input_tokens_seen": 40850900, "step": 1916, "time_per_iteration": 3.2908010482788086 }, { "auxiliary_loss_clip": 0.01078964, "auxiliary_loss_mlp": 0.01004563, "balance_loss_clip": 1.02784026, "balance_loss_mlp": 1.00224996, "epoch": 0.23050562135513739, "flos": 64224260190720.0, "grad_norm": 0.8106259318431462, "language_loss": 0.54262149, "learning_rate": 3.5930472768806412e-06, "loss": 0.56345677, "num_input_tokens_seen": 40909570, "step": 1917, "time_per_iteration": 3.1123931407928467 }, { "auxiliary_loss_clip": 0.0121057, "auxiliary_loss_mlp": 0.01037329, "balance_loss_clip": 1.0630039, "balance_loss_mlp": 1.02821565, "epoch": 0.23062586424577647, "flos": 17313396952320.0, "grad_norm": 1.8356560479041255, "language_loss": 0.77195567, "learning_rate": 3.5925761840476826e-06, "loss": 0.79443467, "num_input_tokens_seen": 40928180, "step": 1918, "time_per_iteration": 2.486764669418335 }, { "auxiliary_loss_clip": 0.01179679, "auxiliary_loss_mlp": 0.01034199, "balance_loss_clip": 1.06335425, "balance_loss_mlp": 1.02507997, "epoch": 0.23074610713641555, "flos": 27855979194240.0, "grad_norm": 1.8909966333300505, "language_loss": 0.80991161, "learning_rate": 3.592104849622183e-06, "loss": 0.83205032, "num_input_tokens_seen": 40950435, "step": 1919, "time_per_iteration": 3.27776837348938 }, { "auxiliary_loss_clip": 0.01142455, "auxiliary_loss_mlp": 0.01027256, "balance_loss_clip": 1.05472791, "balance_loss_mlp": 1.0178206, "epoch": 0.23086635002705466, "flos": 28841798937600.0, "grad_norm": 1.460238265629872, "language_loss": 0.73251581, "learning_rate": 3.591633273675644e-06, "loss": 0.75421298, "num_input_tokens_seen": 40972670, "step": 1920, "time_per_iteration": 3.42789888381958 }, { "auxiliary_loss_clip": 0.01061429, "auxiliary_loss_mlp": 0.01002131, "balance_loss_clip": 1.02862501, "balance_loss_mlp": 0.99994987, "epoch": 0.23098659291769374, "flos": 62923681566720.0, "grad_norm": 0.9280028185681791, "language_loss": 0.58187962, "learning_rate": 3.591161456279602e-06, "loss": 0.60251528, "num_input_tokens_seen": 41018215, "step": 1921, "time_per_iteration": 2.9824328422546387 }, { "auxiliary_loss_clip": 0.01187697, "auxiliary_loss_mlp": 0.01035829, "balance_loss_clip": 1.05890298, "balance_loss_mlp": 1.02608323, "epoch": 0.23110683580833283, "flos": 23476816679040.0, "grad_norm": 1.4965230820115563, "language_loss": 0.80543268, "learning_rate": 3.590689397505633e-06, "loss": 0.82766795, "num_input_tokens_seen": 41039125, "step": 1922, "time_per_iteration": 2.582834243774414 }, { "auxiliary_loss_clip": 0.01206983, "auxiliary_loss_mlp": 0.01038075, "balance_loss_clip": 1.06150413, "balance_loss_mlp": 1.02942085, "epoch": 0.2312270786989719, "flos": 27271066124160.0, "grad_norm": 1.9673967286647924, "language_loss": 0.86816901, "learning_rate": 3.590217097425347e-06, "loss": 0.89061958, "num_input_tokens_seen": 41059025, "step": 1923, "time_per_iteration": 2.5602033138275146 }, { "auxiliary_loss_clip": 0.01211423, "auxiliary_loss_mlp": 0.01035849, "balance_loss_clip": 1.0622586, "balance_loss_mlp": 1.02616942, "epoch": 0.23134732158961102, "flos": 13261344618240.0, "grad_norm": 2.0223802588773228, "language_loss": 0.7117269, "learning_rate": 3.589744556110391e-06, "loss": 0.73419964, "num_input_tokens_seen": 41077015, "step": 1924, "time_per_iteration": 2.4694690704345703 }, { "auxiliary_loss_clip": 0.01178777, "auxiliary_loss_mlp": 0.01041378, "balance_loss_clip": 1.05844867, "balance_loss_mlp": 1.03084612, "epoch": 0.2314675644802501, "flos": 36977648250240.0, "grad_norm": 1.7101525904267878, "language_loss": 0.84407687, "learning_rate": 3.58927177363245e-06, "loss": 0.86627847, "num_input_tokens_seen": 41099840, "step": 1925, "time_per_iteration": 2.691822052001953 }, { "auxiliary_loss_clip": 0.01161829, "auxiliary_loss_mlp": 0.01037412, "balance_loss_clip": 1.05437136, "balance_loss_mlp": 1.02656949, "epoch": 0.2315878073708892, "flos": 23842207779840.0, "grad_norm": 2.042121981488652, "language_loss": 0.72524631, "learning_rate": 3.5887987500632447e-06, "loss": 0.74723876, "num_input_tokens_seen": 41117845, "step": 1926, "time_per_iteration": 2.600806951522827 }, { "auxiliary_loss_clip": 0.0117458, "auxiliary_loss_mlp": 0.01035676, "balance_loss_clip": 1.0590018, "balance_loss_mlp": 1.02722359, "epoch": 0.2317080502615283, "flos": 23039424766080.0, "grad_norm": 1.7192930999374594, "language_loss": 0.84200466, "learning_rate": 3.5883254854745325e-06, "loss": 0.86410713, "num_input_tokens_seen": 41136235, "step": 1927, "time_per_iteration": 2.5778889656066895 }, { "auxiliary_loss_clip": 0.01200773, "auxiliary_loss_mlp": 0.01030972, "balance_loss_clip": 1.06021976, "balance_loss_mlp": 1.02112532, "epoch": 0.23182829315216738, "flos": 11254656435840.0, "grad_norm": 1.980152521474129, "language_loss": 0.74838573, "learning_rate": 3.587851979938107e-06, "loss": 0.7707032, "num_input_tokens_seen": 41153125, "step": 1928, "time_per_iteration": 2.4899919033050537 }, { "auxiliary_loss_clip": 0.01195319, "auxiliary_loss_mlp": 0.01043093, "balance_loss_clip": 1.06158829, "balance_loss_mlp": 1.03388977, "epoch": 0.23194853604280646, "flos": 19828939155840.0, "grad_norm": 2.15368063128977, "language_loss": 0.77221417, "learning_rate": 3.5873782335257985e-06, "loss": 0.79459834, "num_input_tokens_seen": 41171290, "step": 1929, "time_per_iteration": 2.4979662895202637 }, { "auxiliary_loss_clip": 0.01165972, "auxiliary_loss_mlp": 0.01042276, "balance_loss_clip": 1.05931294, "balance_loss_mlp": 1.03238177, "epoch": 0.23206877893344555, "flos": 15305020830720.0, "grad_norm": 2.2882219996758675, "language_loss": 0.78638667, "learning_rate": 3.5869042463094744e-06, "loss": 0.80846918, "num_input_tokens_seen": 41189005, "step": 1930, "time_per_iteration": 2.5520195960998535 }, { "auxiliary_loss_clip": 0.0113308, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.0513823, "balance_loss_mlp": 1.03228736, "epoch": 0.23218902182408466, "flos": 22711488572160.0, "grad_norm": 1.934915508582788, "language_loss": 0.77237588, "learning_rate": 3.586430018361038e-06, "loss": 0.79412866, "num_input_tokens_seen": 41208775, "step": 1931, "time_per_iteration": 2.607043981552124 }, { "auxiliary_loss_clip": 0.01165191, "auxiliary_loss_mlp": 0.01036153, "balance_loss_clip": 1.05443096, "balance_loss_mlp": 1.02593708, "epoch": 0.23230926471472374, "flos": 22710734386560.0, "grad_norm": 2.3843396234048755, "language_loss": 0.76468837, "learning_rate": 3.5859555497524283e-06, "loss": 0.78670192, "num_input_tokens_seen": 41226010, "step": 1932, "time_per_iteration": 2.5569307804107666 }, { "auxiliary_loss_clip": 0.01200543, "auxiliary_loss_mlp": 0.01041051, "balance_loss_clip": 1.06632686, "balance_loss_mlp": 1.03157997, "epoch": 0.23242950760536282, "flos": 20375499479040.0, "grad_norm": 2.0212908925038517, "language_loss": 0.92140174, "learning_rate": 3.5854808405556237e-06, "loss": 0.94381762, "num_input_tokens_seen": 41245245, "step": 1933, "time_per_iteration": 2.5480642318725586 }, { "auxiliary_loss_clip": 0.01167925, "auxiliary_loss_mlp": 0.0103645, "balance_loss_clip": 1.05726433, "balance_loss_mlp": 1.02705622, "epoch": 0.23254975049600193, "flos": 16908324301440.0, "grad_norm": 2.274561139925823, "language_loss": 0.75617975, "learning_rate": 3.5850058908426355e-06, "loss": 0.77822351, "num_input_tokens_seen": 41263795, "step": 1934, "time_per_iteration": 2.541945695877075 }, { "auxiliary_loss_clip": 0.01183573, "auxiliary_loss_mlp": 0.01035274, "balance_loss_clip": 1.05652523, "balance_loss_mlp": 1.02622652, "epoch": 0.23266999338664102, "flos": 23294821443840.0, "grad_norm": 1.8009652748784892, "language_loss": 0.85681021, "learning_rate": 3.584530700685514e-06, "loss": 0.87899864, "num_input_tokens_seen": 41284055, "step": 1935, "time_per_iteration": 2.5566775798797607 }, { "auxiliary_loss_clip": 0.01178952, "auxiliary_loss_mlp": 0.0103645, "balance_loss_clip": 1.06225753, "balance_loss_mlp": 1.02758741, "epoch": 0.2327902362772801, "flos": 19569987031680.0, "grad_norm": 1.9080419084612432, "language_loss": 0.88682699, "learning_rate": 3.5840552701563448e-06, "loss": 0.90898108, "num_input_tokens_seen": 41300255, "step": 1936, "time_per_iteration": 2.5391664505004883 }, { "auxiliary_loss_clip": 0.01210769, "auxiliary_loss_mlp": 0.0104034, "balance_loss_clip": 1.06272459, "balance_loss_mlp": 1.03049958, "epoch": 0.2329104791679192, "flos": 16727514215040.0, "grad_norm": 2.4758434551127393, "language_loss": 0.81664717, "learning_rate": 3.5835795993272513e-06, "loss": 0.8391583, "num_input_tokens_seen": 41318540, "step": 1937, "time_per_iteration": 3.1778643131256104 }, { "auxiliary_loss_clip": 0.0109946, "auxiliary_loss_mlp": 0.01037017, "balance_loss_clip": 1.04839802, "balance_loss_mlp": 1.02793956, "epoch": 0.2330307220585583, "flos": 22163743100160.0, "grad_norm": 2.4643756857755426, "language_loss": 0.71002418, "learning_rate": 3.583103688270391e-06, "loss": 0.73138893, "num_input_tokens_seen": 41338320, "step": 1938, "time_per_iteration": 2.7549662590026855 }, { "auxiliary_loss_clip": 0.01166826, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.05527496, "balance_loss_mlp": 1.02428102, "epoch": 0.23315096494919738, "flos": 19317319787520.0, "grad_norm": 2.093965644250405, "language_loss": 0.89367807, "learning_rate": 3.58262753705796e-06, "loss": 0.91568738, "num_input_tokens_seen": 41353210, "step": 1939, "time_per_iteration": 2.7021915912628174 }, { "auxiliary_loss_clip": 0.01076783, "auxiliary_loss_mlp": 0.01007671, "balance_loss_clip": 1.02579188, "balance_loss_mlp": 1.00554872, "epoch": 0.23327120783983646, "flos": 53031048946560.0, "grad_norm": 0.7598811789932318, "language_loss": 0.55548251, "learning_rate": 3.5821511457621902e-06, "loss": 0.57632697, "num_input_tokens_seen": 41410510, "step": 1940, "time_per_iteration": 3.112018585205078 }, { "auxiliary_loss_clip": 0.01173182, "auxiliary_loss_mlp": 0.01033204, "balance_loss_clip": 1.05838549, "balance_loss_mlp": 1.02259433, "epoch": 0.23339145073047557, "flos": 17126984344320.0, "grad_norm": 13.854412402286249, "language_loss": 0.81403542, "learning_rate": 3.5816745144553497e-06, "loss": 0.83609927, "num_input_tokens_seen": 41425830, "step": 1941, "time_per_iteration": 2.5040903091430664 }, { "auxiliary_loss_clip": 0.01146931, "auxiliary_loss_mlp": 0.01038403, "balance_loss_clip": 1.05569947, "balance_loss_mlp": 1.02922428, "epoch": 0.23351169362111465, "flos": 13078918419840.0, "grad_norm": 2.4082427788422778, "language_loss": 0.75463349, "learning_rate": 3.5811976432097424e-06, "loss": 0.77648687, "num_input_tokens_seen": 41443500, "step": 1942, "time_per_iteration": 3.5705108642578125 }, { "auxiliary_loss_clip": 0.01195404, "auxiliary_loss_mlp": 0.00762539, "balance_loss_clip": 1.06213403, "balance_loss_mlp": 1.00027013, "epoch": 0.23363193651175373, "flos": 15851257931520.0, "grad_norm": 1.968659523671831, "language_loss": 0.844751, "learning_rate": 3.58072053209771e-06, "loss": 0.86433047, "num_input_tokens_seen": 41460055, "step": 1943, "time_per_iteration": 2.530427932739258 }, { "auxiliary_loss_clip": 0.01172055, "auxiliary_loss_mlp": 0.01044887, "balance_loss_clip": 1.05411434, "balance_loss_mlp": 1.03497469, "epoch": 0.23375217940239285, "flos": 21025769345280.0, "grad_norm": 1.902372856775736, "language_loss": 0.79027116, "learning_rate": 3.5802431811916296e-06, "loss": 0.81244051, "num_input_tokens_seen": 41476665, "step": 1944, "time_per_iteration": 3.267354726791382 }, { "auxiliary_loss_clip": 0.01177067, "auxiliary_loss_mlp": 0.01032203, "balance_loss_clip": 1.05877495, "balance_loss_mlp": 1.0234884, "epoch": 0.23387242229303193, "flos": 20594698225920.0, "grad_norm": 1.6536784843099788, "language_loss": 0.80555797, "learning_rate": 3.579765590563916e-06, "loss": 0.82765067, "num_input_tokens_seen": 41496065, "step": 1945, "time_per_iteration": 2.5258796215057373 }, { "auxiliary_loss_clip": 0.01183271, "auxiliary_loss_mlp": 0.01030168, "balance_loss_clip": 1.0581584, "balance_loss_mlp": 1.02128744, "epoch": 0.233992665183671, "flos": 24279491952000.0, "grad_norm": 1.788488310365844, "language_loss": 0.81923926, "learning_rate": 3.579287760287017e-06, "loss": 0.84137368, "num_input_tokens_seen": 41516815, "step": 1946, "time_per_iteration": 3.2344515323638916 }, { "auxiliary_loss_clip": 0.01191634, "auxiliary_loss_mlp": 0.01030933, "balance_loss_clip": 1.06104338, "balance_loss_mlp": 1.02223706, "epoch": 0.2341129080743101, "flos": 30154621121280.0, "grad_norm": 1.997742852629854, "language_loss": 0.72768056, "learning_rate": 3.578809690433421e-06, "loss": 0.74990624, "num_input_tokens_seen": 41538525, "step": 1947, "time_per_iteration": 2.5673327445983887 }, { "auxiliary_loss_clip": 0.01213955, "auxiliary_loss_mlp": 0.01039986, "balance_loss_clip": 1.06467271, "balance_loss_mlp": 1.02944827, "epoch": 0.2342331509649492, "flos": 22784135829120.0, "grad_norm": 2.453620508513181, "language_loss": 0.80996895, "learning_rate": 3.578331381075651e-06, "loss": 0.83250839, "num_input_tokens_seen": 41559025, "step": 1948, "time_per_iteration": 2.4825472831726074 }, { "auxiliary_loss_clip": 0.01193273, "auxiliary_loss_mlp": 0.01039617, "balance_loss_clip": 1.05786991, "balance_loss_mlp": 1.02955592, "epoch": 0.2343533938555883, "flos": 23623152687360.0, "grad_norm": 3.0717023312988476, "language_loss": 0.69698071, "learning_rate": 3.5778528322862646e-06, "loss": 0.71930957, "num_input_tokens_seen": 41577845, "step": 1949, "time_per_iteration": 2.512026071548462 }, { "auxiliary_loss_clip": 0.01195439, "auxiliary_loss_mlp": 0.01032487, "balance_loss_clip": 1.05953264, "balance_loss_mlp": 1.02398121, "epoch": 0.23447363674622737, "flos": 24570332375040.0, "grad_norm": 1.516010526340661, "language_loss": 0.86413616, "learning_rate": 3.5773740441378585e-06, "loss": 0.88641536, "num_input_tokens_seen": 41598600, "step": 1950, "time_per_iteration": 2.537116765975952 }, { "auxiliary_loss_clip": 0.01190885, "auxiliary_loss_mlp": 0.01031584, "balance_loss_clip": 1.05854762, "balance_loss_mlp": 1.02264929, "epoch": 0.23459387963686648, "flos": 53140322119680.0, "grad_norm": 2.240994932692058, "language_loss": 0.73923457, "learning_rate": 3.5768950167030633e-06, "loss": 0.76145923, "num_input_tokens_seen": 41623300, "step": 1951, "time_per_iteration": 2.8036646842956543 }, { "auxiliary_loss_clip": 0.01163753, "auxiliary_loss_mlp": 0.01032652, "balance_loss_clip": 1.05149293, "balance_loss_mlp": 1.02377105, "epoch": 0.23471412252750556, "flos": 23951412103680.0, "grad_norm": 1.7475957464706764, "language_loss": 0.78225642, "learning_rate": 3.576415750054548e-06, "loss": 0.80422044, "num_input_tokens_seen": 41643420, "step": 1952, "time_per_iteration": 2.5627200603485107 }, { "auxiliary_loss_clip": 0.01169234, "auxiliary_loss_mlp": 0.01037377, "balance_loss_clip": 1.05642009, "balance_loss_mlp": 1.0279479, "epoch": 0.23483436541814465, "flos": 15706573948800.0, "grad_norm": 2.283613778845595, "language_loss": 0.85904598, "learning_rate": 3.5759362442650172e-06, "loss": 0.8811121, "num_input_tokens_seen": 41660170, "step": 1953, "time_per_iteration": 2.5083799362182617 }, { "auxiliary_loss_clip": 0.01190633, "auxiliary_loss_mlp": 0.01031823, "balance_loss_clip": 1.05948448, "balance_loss_mlp": 1.02261388, "epoch": 0.23495460830878373, "flos": 24936262179840.0, "grad_norm": 2.1122007247560655, "language_loss": 0.85518754, "learning_rate": 3.5754564994072113e-06, "loss": 0.87741208, "num_input_tokens_seen": 41679010, "step": 1954, "time_per_iteration": 2.550895929336548 }, { "auxiliary_loss_clip": 0.01177539, "auxiliary_loss_mlp": 0.01032101, "balance_loss_clip": 1.05731463, "balance_loss_mlp": 1.02307677, "epoch": 0.23507485119942284, "flos": 30482665056000.0, "grad_norm": 1.8658319757544783, "language_loss": 0.60132426, "learning_rate": 3.5749765155539067e-06, "loss": 0.62342072, "num_input_tokens_seen": 41699495, "step": 1955, "time_per_iteration": 2.627254009246826 }, { "auxiliary_loss_clip": 0.01163213, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.05547774, "balance_loss_mlp": 1.02401376, "epoch": 0.23519509409006192, "flos": 18329129746560.0, "grad_norm": 2.371587653648976, "language_loss": 0.92172652, "learning_rate": 3.574496292777917e-06, "loss": 0.94369113, "num_input_tokens_seen": 41717705, "step": 1956, "time_per_iteration": 2.5463154315948486 }, { "auxiliary_loss_clip": 0.0118499, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.05857241, "balance_loss_mlp": 1.0264461, "epoch": 0.235315336980701, "flos": 29643217234560.0, "grad_norm": 2.016940641086042, "language_loss": 0.71546453, "learning_rate": 3.574015831152092e-06, "loss": 0.73768353, "num_input_tokens_seen": 41738120, "step": 1957, "time_per_iteration": 2.615809440612793 }, { "auxiliary_loss_clip": 0.01169963, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 1.05624402, "balance_loss_mlp": 1.02342296, "epoch": 0.23543557987134012, "flos": 18551704371840.0, "grad_norm": 2.7616166720564794, "language_loss": 0.83268374, "learning_rate": 3.573535130749316e-06, "loss": 0.85470885, "num_input_tokens_seen": 41756070, "step": 1958, "time_per_iteration": 2.5327565670013428 }, { "auxiliary_loss_clip": 0.01171524, "auxiliary_loss_mlp": 0.01034153, "balance_loss_clip": 1.05801272, "balance_loss_mlp": 1.02512288, "epoch": 0.2355558227619792, "flos": 24679033908480.0, "grad_norm": 1.9616547164735172, "language_loss": 0.73833132, "learning_rate": 3.5730541916425127e-06, "loss": 0.76038814, "num_input_tokens_seen": 41777550, "step": 1959, "time_per_iteration": 2.5775485038757324 }, { "auxiliary_loss_clip": 0.01164328, "auxiliary_loss_mlp": 0.01038693, "balance_loss_clip": 1.05340123, "balance_loss_mlp": 1.02992558, "epoch": 0.23567606565261828, "flos": 21944795748480.0, "grad_norm": 1.9065976588192277, "language_loss": 0.85866898, "learning_rate": 3.572573013904639e-06, "loss": 0.88069922, "num_input_tokens_seen": 41797460, "step": 1960, "time_per_iteration": 2.581597328186035 }, { "auxiliary_loss_clip": 0.01208691, "auxiliary_loss_mlp": 0.01035347, "balance_loss_clip": 1.06149447, "balance_loss_mlp": 1.02654314, "epoch": 0.2357963085432574, "flos": 13589352639360.0, "grad_norm": 1.9331691385627179, "language_loss": 0.91975331, "learning_rate": 3.572091597608689e-06, "loss": 0.94219375, "num_input_tokens_seen": 41815585, "step": 1961, "time_per_iteration": 2.4540953636169434 }, { "auxiliary_loss_clip": 0.01181341, "auxiliary_loss_mlp": 0.01034676, "balance_loss_clip": 1.05820346, "balance_loss_mlp": 1.02446032, "epoch": 0.23591655143389648, "flos": 22088689632000.0, "grad_norm": 3.076326390542941, "language_loss": 0.73454905, "learning_rate": 3.571609942827694e-06, "loss": 0.75670922, "num_input_tokens_seen": 41834700, "step": 1962, "time_per_iteration": 2.5505731105804443 }, { "auxiliary_loss_clip": 0.01174543, "auxiliary_loss_mlp": 0.01034605, "balance_loss_clip": 1.05405724, "balance_loss_mlp": 1.02540207, "epoch": 0.23603679432453556, "flos": 17017349057280.0, "grad_norm": 1.8026248125479925, "language_loss": 0.88578212, "learning_rate": 3.57112804963472e-06, "loss": 0.90787363, "num_input_tokens_seen": 41852915, "step": 1963, "time_per_iteration": 3.2919487953186035 }, { "auxiliary_loss_clip": 0.0115916, "auxiliary_loss_mlp": 0.01031258, "balance_loss_clip": 1.05656552, "balance_loss_mlp": 1.02246654, "epoch": 0.23615703721517464, "flos": 19171307001600.0, "grad_norm": 2.064159886850844, "language_loss": 0.76430243, "learning_rate": 3.57064591810287e-06, "loss": 0.78620666, "num_input_tokens_seen": 41870415, "step": 1964, "time_per_iteration": 2.523857593536377 }, { "auxiliary_loss_clip": 0.01207274, "auxiliary_loss_mlp": 0.00762472, "balance_loss_clip": 1.06205595, "balance_loss_mlp": 1.00028455, "epoch": 0.23627728010581375, "flos": 19098803399040.0, "grad_norm": 2.080113470344047, "language_loss": 0.80525947, "learning_rate": 3.570163548305284e-06, "loss": 0.82495689, "num_input_tokens_seen": 41889345, "step": 1965, "time_per_iteration": 2.500753164291382 }, { "auxiliary_loss_clip": 0.01176424, "auxiliary_loss_mlp": 0.01042409, "balance_loss_clip": 1.0567956, "balance_loss_mlp": 1.03224087, "epoch": 0.23639752299645284, "flos": 14282213057280.0, "grad_norm": 2.1019603884833056, "language_loss": 0.70253462, "learning_rate": 3.569680940315135e-06, "loss": 0.72472292, "num_input_tokens_seen": 41905745, "step": 1966, "time_per_iteration": 2.4957430362701416 }, { "auxiliary_loss_clip": 0.01169035, "auxiliary_loss_mlp": 0.01035293, "balance_loss_clip": 1.05685401, "balance_loss_mlp": 1.02512431, "epoch": 0.23651776588709192, "flos": 22893411980160.0, "grad_norm": 1.8038779279667028, "language_loss": 0.81704974, "learning_rate": 3.5691980942056356e-06, "loss": 0.83909303, "num_input_tokens_seen": 41925115, "step": 1967, "time_per_iteration": 2.568981170654297 }, { "auxiliary_loss_clip": 0.01194895, "auxiliary_loss_mlp": 0.01031849, "balance_loss_clip": 1.05738807, "balance_loss_mlp": 1.02254474, "epoch": 0.23663800877773103, "flos": 18624531196800.0, "grad_norm": 1.9710045497677662, "language_loss": 0.79586959, "learning_rate": 3.5687150100500332e-06, "loss": 0.81813693, "num_input_tokens_seen": 41944815, "step": 1968, "time_per_iteration": 3.2783961296081543 }, { "auxiliary_loss_clip": 0.01195053, "auxiliary_loss_mlp": 0.01034209, "balance_loss_clip": 1.05926847, "balance_loss_mlp": 1.02483881, "epoch": 0.2367582516683701, "flos": 25555828896000.0, "grad_norm": 1.5459793737879222, "language_loss": 0.74555248, "learning_rate": 3.568231687921611e-06, "loss": 0.76784509, "num_input_tokens_seen": 41964990, "step": 1969, "time_per_iteration": 2.5677361488342285 }, { "auxiliary_loss_clip": 0.01203026, "auxiliary_loss_mlp": 0.01038998, "balance_loss_clip": 1.05865788, "balance_loss_mlp": 1.03006363, "epoch": 0.2368784945590092, "flos": 23295072839040.0, "grad_norm": 1.5176859319574119, "language_loss": 0.80185992, "learning_rate": 3.5677481278936883e-06, "loss": 0.82428014, "num_input_tokens_seen": 41984570, "step": 1970, "time_per_iteration": 3.209676742553711 }, { "auxiliary_loss_clip": 0.01087552, "auxiliary_loss_mlp": 0.01003677, "balance_loss_clip": 1.03652489, "balance_loss_mlp": 1.00154293, "epoch": 0.23699873744964828, "flos": 69859291875840.0, "grad_norm": 0.826442959344438, "language_loss": 0.57792377, "learning_rate": 3.5672643300396214e-06, "loss": 0.59883606, "num_input_tokens_seen": 42053715, "step": 1971, "time_per_iteration": 3.1931276321411133 }, { "auxiliary_loss_clip": 0.01162301, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.05534077, "balance_loss_mlp": 1.02682185, "epoch": 0.2371189803402874, "flos": 21835052720640.0, "grad_norm": 2.2883617359961472, "language_loss": 0.67528361, "learning_rate": 3.566780294432802e-06, "loss": 0.69726413, "num_input_tokens_seen": 42070890, "step": 1972, "time_per_iteration": 3.3917527198791504 }, { "auxiliary_loss_clip": 0.01207256, "auxiliary_loss_mlp": 0.01030354, "balance_loss_clip": 1.06216753, "balance_loss_mlp": 1.02140689, "epoch": 0.23723922323092647, "flos": 21908490076800.0, "grad_norm": 2.3650443359334927, "language_loss": 0.75220799, "learning_rate": 3.566296021146657e-06, "loss": 0.77458411, "num_input_tokens_seen": 42090270, "step": 1973, "time_per_iteration": 2.4791407585144043 }, { "auxiliary_loss_clip": 0.01208808, "auxiliary_loss_mlp": 0.01036458, "balance_loss_clip": 1.06155145, "balance_loss_mlp": 1.02676046, "epoch": 0.23735946612156555, "flos": 32708803380480.0, "grad_norm": 1.7747653126593388, "language_loss": 0.73194784, "learning_rate": 3.565811510254652e-06, "loss": 0.75440049, "num_input_tokens_seen": 42111150, "step": 1974, "time_per_iteration": 2.5685746669769287 }, { "auxiliary_loss_clip": 0.01086809, "auxiliary_loss_mlp": 0.01002918, "balance_loss_clip": 1.02607262, "balance_loss_mlp": 1.00077176, "epoch": 0.23747970901220466, "flos": 70546944821760.0, "grad_norm": 0.8385790091261014, "language_loss": 0.5827831, "learning_rate": 3.5653267618302845e-06, "loss": 0.60368037, "num_input_tokens_seen": 42178730, "step": 1975, "time_per_iteration": 3.144622802734375 }, { "auxiliary_loss_clip": 0.01206732, "auxiliary_loss_mlp": 0.01032464, "balance_loss_clip": 1.06019032, "balance_loss_mlp": 1.02249217, "epoch": 0.23759995190284375, "flos": 20849807594880.0, "grad_norm": 1.6352899386936546, "language_loss": 0.85499787, "learning_rate": 3.564841775947093e-06, "loss": 0.87738985, "num_input_tokens_seen": 42199620, "step": 1976, "time_per_iteration": 2.506077289581299 }, { "auxiliary_loss_clip": 0.01162125, "auxiliary_loss_mlp": 0.0103304, "balance_loss_clip": 1.05427802, "balance_loss_mlp": 1.02406394, "epoch": 0.23772019479348283, "flos": 32921645420160.0, "grad_norm": 2.3654227863844324, "language_loss": 0.7636494, "learning_rate": 3.5643565526786475e-06, "loss": 0.78560114, "num_input_tokens_seen": 42219560, "step": 1977, "time_per_iteration": 2.671215295791626 }, { "auxiliary_loss_clip": 0.01210421, "auxiliary_loss_mlp": 0.0103438, "balance_loss_clip": 1.06404459, "balance_loss_mlp": 1.02515888, "epoch": 0.2378404376841219, "flos": 32342765834880.0, "grad_norm": 1.6482069416505059, "language_loss": 0.7702046, "learning_rate": 3.5638710920985574e-06, "loss": 0.79265261, "num_input_tokens_seen": 42241020, "step": 1978, "time_per_iteration": 2.5967066287994385 }, { "auxiliary_loss_clip": 0.01198165, "auxiliary_loss_mlp": 0.00762793, "balance_loss_clip": 1.05887723, "balance_loss_mlp": 1.00031805, "epoch": 0.23796068057476102, "flos": 22997624313600.0, "grad_norm": 2.640323676287713, "language_loss": 0.81991374, "learning_rate": 3.5633853942804655e-06, "loss": 0.83952338, "num_input_tokens_seen": 42259345, "step": 1979, "time_per_iteration": 2.5839288234710693 }, { "auxiliary_loss_clip": 0.01159776, "auxiliary_loss_mlp": 0.01038652, "balance_loss_clip": 1.05117679, "balance_loss_mlp": 1.02843022, "epoch": 0.2380809234654001, "flos": 13480938414720.0, "grad_norm": 2.015254671773996, "language_loss": 0.76298261, "learning_rate": 3.5628994592980527e-06, "loss": 0.78496689, "num_input_tokens_seen": 42277250, "step": 1980, "time_per_iteration": 2.5712084770202637 }, { "auxiliary_loss_clip": 0.01205871, "auxiliary_loss_mlp": 0.01032776, "balance_loss_clip": 1.059834, "balance_loss_mlp": 1.02372253, "epoch": 0.2382011663560392, "flos": 16871803148160.0, "grad_norm": 1.6412887049005456, "language_loss": 0.7041977, "learning_rate": 3.562413287225034e-06, "loss": 0.7265842, "num_input_tokens_seen": 42295360, "step": 1981, "time_per_iteration": 2.4527695178985596 }, { "auxiliary_loss_clip": 0.01189275, "auxiliary_loss_mlp": 0.01034026, "balance_loss_clip": 1.05976534, "balance_loss_mlp": 1.02482259, "epoch": 0.2383214092466783, "flos": 18441135331200.0, "grad_norm": 2.1190734102078292, "language_loss": 0.89265573, "learning_rate": 3.5619268781351623e-06, "loss": 0.91488874, "num_input_tokens_seen": 42313430, "step": 1982, "time_per_iteration": 2.487014055252075 }, { "auxiliary_loss_clip": 0.0117653, "auxiliary_loss_mlp": 0.0103638, "balance_loss_clip": 1.05964136, "balance_loss_mlp": 1.02799964, "epoch": 0.23844165213731738, "flos": 19755717281280.0, "grad_norm": 1.7595274969323675, "language_loss": 0.77049565, "learning_rate": 3.5614402321022256e-06, "loss": 0.79262477, "num_input_tokens_seen": 42331260, "step": 1983, "time_per_iteration": 2.530602216720581 }, { "auxiliary_loss_clip": 0.0114047, "auxiliary_loss_mlp": 0.01042229, "balance_loss_clip": 1.05213284, "balance_loss_mlp": 1.03290057, "epoch": 0.23856189502795647, "flos": 23367360960000.0, "grad_norm": 1.9057146581507358, "language_loss": 0.87410343, "learning_rate": 3.5609533492000463e-06, "loss": 0.89593041, "num_input_tokens_seen": 42350150, "step": 1984, "time_per_iteration": 2.6270885467529297 }, { "auxiliary_loss_clip": 0.01176811, "auxiliary_loss_mlp": 0.01034568, "balance_loss_clip": 1.06277537, "balance_loss_mlp": 1.02539444, "epoch": 0.23868213791859555, "flos": 23475056912640.0, "grad_norm": 2.046432777296419, "language_loss": 0.78512508, "learning_rate": 3.560466229502485e-06, "loss": 0.80723882, "num_input_tokens_seen": 42369495, "step": 1985, "time_per_iteration": 2.5581717491149902 }, { "auxiliary_loss_clip": 0.01174557, "auxiliary_loss_mlp": 0.00762841, "balance_loss_clip": 1.06091213, "balance_loss_mlp": 1.00029492, "epoch": 0.23880238080923466, "flos": 16617340224000.0, "grad_norm": 3.3041719414190522, "language_loss": 0.89842248, "learning_rate": 3.5599788730834384e-06, "loss": 0.91779643, "num_input_tokens_seen": 42387455, "step": 1986, "time_per_iteration": 2.500875473022461 }, { "auxiliary_loss_clip": 0.0119611, "auxiliary_loss_mlp": 0.01036622, "balance_loss_clip": 1.05950332, "balance_loss_mlp": 1.02670944, "epoch": 0.23892262369987374, "flos": 17348409734400.0, "grad_norm": 2.055682914630254, "language_loss": 0.78562576, "learning_rate": 3.559491280016836e-06, "loss": 0.80795312, "num_input_tokens_seen": 42405400, "step": 1987, "time_per_iteration": 2.488459587097168 }, { "auxiliary_loss_clip": 0.01177388, "auxiliary_loss_mlp": 0.01038194, "balance_loss_clip": 1.0567838, "balance_loss_mlp": 1.02890146, "epoch": 0.23904286659051283, "flos": 22309899540480.0, "grad_norm": 1.7889473250699526, "language_loss": 0.71063244, "learning_rate": 3.5590034503766465e-06, "loss": 0.73278826, "num_input_tokens_seen": 42425065, "step": 1988, "time_per_iteration": 2.5506553649902344 }, { "auxiliary_loss_clip": 0.01207892, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.06358266, "balance_loss_mlp": 1.03207088, "epoch": 0.23916310948115194, "flos": 21178246579200.0, "grad_norm": 2.284243880907437, "language_loss": 0.8094033, "learning_rate": 3.558515384236874e-06, "loss": 0.83189559, "num_input_tokens_seen": 42442495, "step": 1989, "time_per_iteration": 3.2694685459136963 }, { "auxiliary_loss_clip": 0.01151626, "auxiliary_loss_mlp": 0.00763276, "balance_loss_clip": 1.05509472, "balance_loss_mlp": 1.00031459, "epoch": 0.23928335237179102, "flos": 14137349506560.0, "grad_norm": 1.8378703517683261, "language_loss": 0.8371309, "learning_rate": 3.558027081671556e-06, "loss": 0.85627991, "num_input_tokens_seen": 42459480, "step": 1990, "time_per_iteration": 2.533486843109131 }, { "auxiliary_loss_clip": 0.01194025, "auxiliary_loss_mlp": 0.01031191, "balance_loss_clip": 1.05664587, "balance_loss_mlp": 1.02153504, "epoch": 0.2394035952624301, "flos": 23769596436480.0, "grad_norm": 1.8477700213275692, "language_loss": 0.68970048, "learning_rate": 3.557538542754769e-06, "loss": 0.71195263, "num_input_tokens_seen": 42479175, "step": 1991, "time_per_iteration": 2.5401432514190674 }, { "auxiliary_loss_clip": 0.0120799, "auxiliary_loss_mlp": 0.01034698, "balance_loss_clip": 1.06237936, "balance_loss_mlp": 1.02566147, "epoch": 0.2395238381530692, "flos": 24206198250240.0, "grad_norm": 1.7935020251532408, "language_loss": 0.66712642, "learning_rate": 3.557049767560623e-06, "loss": 0.68955326, "num_input_tokens_seen": 42498090, "step": 1992, "time_per_iteration": 2.4828922748565674 }, { "auxiliary_loss_clip": 0.0115148, "auxiliary_loss_mlp": 0.01030329, "balance_loss_clip": 1.05629671, "balance_loss_mlp": 1.02146566, "epoch": 0.2396440810437083, "flos": 25295763450240.0, "grad_norm": 2.450885374296754, "language_loss": 0.85910034, "learning_rate": 3.5565607561632655e-06, "loss": 0.88091838, "num_input_tokens_seen": 42516930, "step": 1993, "time_per_iteration": 2.6355857849121094 }, { "auxiliary_loss_clip": 0.01174972, "auxiliary_loss_mlp": 0.01030755, "balance_loss_clip": 1.05711079, "balance_loss_mlp": 1.02110481, "epoch": 0.23976432393434738, "flos": 28543093436160.0, "grad_norm": 2.7263925761049914, "language_loss": 0.79758215, "learning_rate": 3.5560715086368787e-06, "loss": 0.81963938, "num_input_tokens_seen": 42534800, "step": 1994, "time_per_iteration": 3.3718197345733643 }, { "auxiliary_loss_clip": 0.01173836, "auxiliary_loss_mlp": 0.01037816, "balance_loss_clip": 1.0590694, "balance_loss_mlp": 1.02864254, "epoch": 0.23988456682498646, "flos": 19494358945920.0, "grad_norm": 1.9421676909343608, "language_loss": 0.8211785, "learning_rate": 3.5555820250556816e-06, "loss": 0.84329504, "num_input_tokens_seen": 42552000, "step": 1995, "time_per_iteration": 2.5237960815429688 }, { "auxiliary_loss_clip": 0.01178616, "auxiliary_loss_mlp": 0.01039295, "balance_loss_clip": 1.05872345, "balance_loss_mlp": 1.02992523, "epoch": 0.24000480971562557, "flos": 20266331068800.0, "grad_norm": 2.1761493654806574, "language_loss": 0.69464111, "learning_rate": 3.5550923054939278e-06, "loss": 0.71682024, "num_input_tokens_seen": 42571455, "step": 1996, "time_per_iteration": 3.2587521076202393 }, { "auxiliary_loss_clip": 0.01136948, "auxiliary_loss_mlp": 0.01038292, "balance_loss_clip": 1.04900229, "balance_loss_mlp": 1.02932119, "epoch": 0.24012505260626466, "flos": 25443176866560.0, "grad_norm": 1.7486625121183688, "language_loss": 0.74281919, "learning_rate": 3.5546023500259083e-06, "loss": 0.76457155, "num_input_tokens_seen": 42592550, "step": 1997, "time_per_iteration": 2.6277942657470703 }, { "auxiliary_loss_clip": 0.01150816, "auxiliary_loss_mlp": 0.01037663, "balance_loss_clip": 1.05213046, "balance_loss_mlp": 1.02893686, "epoch": 0.24024529549690374, "flos": 15553342529280.0, "grad_norm": 1.8432603497315647, "language_loss": 0.80408478, "learning_rate": 3.5541121587259477e-06, "loss": 0.82596958, "num_input_tokens_seen": 42610385, "step": 1998, "time_per_iteration": 3.370062828063965 }, { "auxiliary_loss_clip": 0.01085386, "auxiliary_loss_mlp": 0.01003869, "balance_loss_clip": 1.02732277, "balance_loss_mlp": 1.00159252, "epoch": 0.24036553838754285, "flos": 57122351867520.0, "grad_norm": 0.9051604696045212, "language_loss": 0.57934153, "learning_rate": 3.553621731668408e-06, "loss": 0.60023409, "num_input_tokens_seen": 42673595, "step": 1999, "time_per_iteration": 3.0811831951141357 }, { "auxiliary_loss_clip": 0.01182294, "auxiliary_loss_mlp": 0.01033074, "balance_loss_clip": 1.05519915, "balance_loss_mlp": 1.02321529, "epoch": 0.24048578127818193, "flos": 24969946158720.0, "grad_norm": 2.115910824762654, "language_loss": 0.82974321, "learning_rate": 3.553131068927688e-06, "loss": 0.85189688, "num_input_tokens_seen": 42692000, "step": 2000, "time_per_iteration": 2.5622522830963135 }, { "auxiliary_loss_clip": 0.01160126, "auxiliary_loss_mlp": 0.01032434, "balance_loss_clip": 1.05595148, "balance_loss_mlp": 1.02344012, "epoch": 0.24060602416882101, "flos": 23330947547520.0, "grad_norm": 1.591205870386221, "language_loss": 0.80264258, "learning_rate": 3.552640170578219e-06, "loss": 0.82456821, "num_input_tokens_seen": 42712250, "step": 2001, "time_per_iteration": 2.5879533290863037 }, { "auxiliary_loss_clip": 0.01174166, "auxiliary_loss_mlp": 0.01033201, "balance_loss_clip": 1.05526066, "balance_loss_mlp": 1.02407598, "epoch": 0.2407262670594601, "flos": 14173260128640.0, "grad_norm": 2.3526902839227986, "language_loss": 0.78394145, "learning_rate": 3.5521490366944703e-06, "loss": 0.80601513, "num_input_tokens_seen": 42729900, "step": 2002, "time_per_iteration": 2.5069727897644043 }, { "auxiliary_loss_clip": 0.01161988, "auxiliary_loss_mlp": 0.01030928, "balance_loss_clip": 1.05648637, "balance_loss_mlp": 1.02221417, "epoch": 0.2408465099500992, "flos": 13663113217920.0, "grad_norm": 2.491432139765458, "language_loss": 0.80301785, "learning_rate": 3.5516576673509474e-06, "loss": 0.824947, "num_input_tokens_seen": 42747900, "step": 2003, "time_per_iteration": 2.532531976699829 }, { "auxiliary_loss_clip": 0.01207744, "auxiliary_loss_mlp": 0.0103969, "balance_loss_clip": 1.06081462, "balance_loss_mlp": 1.03015327, "epoch": 0.2409667528407383, "flos": 31248029076480.0, "grad_norm": 1.6572023190842826, "language_loss": 0.8611142, "learning_rate": 3.5511660626221896e-06, "loss": 0.88358855, "num_input_tokens_seen": 42768540, "step": 2004, "time_per_iteration": 2.546412229537964 }, { "auxiliary_loss_clip": 0.01177068, "auxiliary_loss_mlp": 0.00763545, "balance_loss_clip": 1.06065214, "balance_loss_mlp": 1.0003792, "epoch": 0.24108699573137737, "flos": 22199941031040.0, "grad_norm": 2.2442541005385785, "language_loss": 0.8914994, "learning_rate": 3.5506742225827744e-06, "loss": 0.91090548, "num_input_tokens_seen": 42785395, "step": 2005, "time_per_iteration": 2.5307891368865967 }, { "auxiliary_loss_clip": 0.01160792, "auxiliary_loss_mlp": 0.01036895, "balance_loss_clip": 1.05623555, "balance_loss_mlp": 1.02810907, "epoch": 0.24120723862201648, "flos": 26103035664000.0, "grad_norm": 2.284210840467783, "language_loss": 0.90229243, "learning_rate": 3.5501821473073116e-06, "loss": 0.92426932, "num_input_tokens_seen": 42801980, "step": 2006, "time_per_iteration": 2.5934860706329346 }, { "auxiliary_loss_clip": 0.01156125, "auxiliary_loss_mlp": 0.01043443, "balance_loss_clip": 1.05615366, "balance_loss_mlp": 1.03454995, "epoch": 0.24132748151265557, "flos": 18624926246400.0, "grad_norm": 1.9923388271111364, "language_loss": 0.86867446, "learning_rate": 3.54968983687045e-06, "loss": 0.89067018, "num_input_tokens_seen": 42818850, "step": 2007, "time_per_iteration": 2.5337555408477783 }, { "auxiliary_loss_clip": 0.0117583, "auxiliary_loss_mlp": 0.01040906, "balance_loss_clip": 1.0557282, "balance_loss_mlp": 1.03131557, "epoch": 0.24144772440329465, "flos": 15267673664640.0, "grad_norm": 2.6944936364528567, "language_loss": 0.89282227, "learning_rate": 3.549197291346872e-06, "loss": 0.91498965, "num_input_tokens_seen": 42835375, "step": 2008, "time_per_iteration": 2.50006365776062 }, { "auxiliary_loss_clip": 0.01191005, "auxiliary_loss_mlp": 0.01036046, "balance_loss_clip": 1.05666161, "balance_loss_mlp": 1.02691436, "epoch": 0.24156796729393373, "flos": 24024274842240.0, "grad_norm": 2.0266248379685807, "language_loss": 0.79381126, "learning_rate": 3.548704510811297e-06, "loss": 0.81608176, "num_input_tokens_seen": 42854570, "step": 2009, "time_per_iteration": 2.5244953632354736 }, { "auxiliary_loss_clip": 0.01152169, "auxiliary_loss_mlp": 0.0103599, "balance_loss_clip": 1.05248916, "balance_loss_mlp": 1.02698994, "epoch": 0.24168821018457284, "flos": 26286790665600.0, "grad_norm": 2.4467328807957167, "language_loss": 0.74703932, "learning_rate": 3.5482114953384787e-06, "loss": 0.7689209, "num_input_tokens_seen": 42873800, "step": 2010, "time_per_iteration": 2.631364583969116 }, { "auxiliary_loss_clip": 0.01194041, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.05972624, "balance_loss_mlp": 1.02607703, "epoch": 0.24180845307521193, "flos": 18223193560320.0, "grad_norm": 2.093571568753876, "language_loss": 0.84556746, "learning_rate": 3.5477182450032077e-06, "loss": 0.8678658, "num_input_tokens_seen": 42892400, "step": 2011, "time_per_iteration": 2.475832939147949 }, { "auxiliary_loss_clip": 0.01191702, "auxiliary_loss_mlp": 0.01036528, "balance_loss_clip": 1.05906332, "balance_loss_mlp": 1.02649117, "epoch": 0.241928695965851, "flos": 20449260057600.0, "grad_norm": 4.263210120591379, "language_loss": 0.83227575, "learning_rate": 3.5472247598803097e-06, "loss": 0.85455811, "num_input_tokens_seen": 42911745, "step": 2012, "time_per_iteration": 2.503694534301758 }, { "auxiliary_loss_clip": 0.01205802, "auxiliary_loss_mlp": 0.01033374, "balance_loss_clip": 1.05881751, "balance_loss_mlp": 1.02371764, "epoch": 0.24204893885649012, "flos": 25556475340800.0, "grad_norm": 2.3139497750952147, "language_loss": 0.85301667, "learning_rate": 3.546731040044645e-06, "loss": 0.87540841, "num_input_tokens_seen": 42926915, "step": 2013, "time_per_iteration": 2.5016987323760986 }, { "auxiliary_loss_clip": 0.01205498, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.06111646, "balance_loss_mlp": 1.02622962, "epoch": 0.2421691817471292, "flos": 30660207004800.0, "grad_norm": 1.7877170456863605, "language_loss": 0.74725783, "learning_rate": 3.546237085571112e-06, "loss": 0.76966894, "num_input_tokens_seen": 42945350, "step": 2014, "time_per_iteration": 3.248819589614868 }, { "auxiliary_loss_clip": 0.01194043, "auxiliary_loss_mlp": 0.01031299, "balance_loss_clip": 1.060992, "balance_loss_mlp": 1.02166653, "epoch": 0.24228942463776829, "flos": 21945011230080.0, "grad_norm": 2.0165849150602617, "language_loss": 0.7227695, "learning_rate": 3.5457428965346425e-06, "loss": 0.74502295, "num_input_tokens_seen": 42964290, "step": 2015, "time_per_iteration": 2.5165798664093018 }, { "auxiliary_loss_clip": 0.01134629, "auxiliary_loss_mlp": 0.01031491, "balance_loss_clip": 1.05250359, "balance_loss_mlp": 1.02195454, "epoch": 0.2424096675284074, "flos": 33984493879680.0, "grad_norm": 2.99718603739699, "language_loss": 0.75075448, "learning_rate": 3.545248473010205e-06, "loss": 0.77241564, "num_input_tokens_seen": 42987095, "step": 2016, "time_per_iteration": 2.7432687282562256 }, { "auxiliary_loss_clip": 0.01208451, "auxiliary_loss_mlp": 0.00763762, "balance_loss_clip": 1.06087685, "balance_loss_mlp": 1.00045896, "epoch": 0.24252991041904648, "flos": 21653416621440.0, "grad_norm": 2.2962792903012987, "language_loss": 0.8779695, "learning_rate": 3.544753815072802e-06, "loss": 0.89769161, "num_input_tokens_seen": 43005750, "step": 2017, "time_per_iteration": 2.511944532394409 }, { "auxiliary_loss_clip": 0.01111044, "auxiliary_loss_mlp": 0.01032311, "balance_loss_clip": 1.04531264, "balance_loss_mlp": 1.02258956, "epoch": 0.24265015330968556, "flos": 21870065502720.0, "grad_norm": 1.8175690353899094, "language_loss": 0.88453114, "learning_rate": 3.544258922797474e-06, "loss": 0.90596473, "num_input_tokens_seen": 43023870, "step": 2018, "time_per_iteration": 2.6642651557922363 }, { "auxiliary_loss_clip": 0.01205042, "auxiliary_loss_mlp": 0.01039417, "balance_loss_clip": 1.06026077, "balance_loss_mlp": 1.03032136, "epoch": 0.24277039620032465, "flos": 25628260671360.0, "grad_norm": 1.5714857818439214, "language_loss": 0.78236067, "learning_rate": 3.543763796259295e-06, "loss": 0.80480522, "num_input_tokens_seen": 43043825, "step": 2019, "time_per_iteration": 2.5179452896118164 }, { "auxiliary_loss_clip": 0.01190779, "auxiliary_loss_mlp": 0.01035655, "balance_loss_clip": 1.05706573, "balance_loss_mlp": 1.02559364, "epoch": 0.24289063909096376, "flos": 26286575184000.0, "grad_norm": 1.849859695734387, "language_loss": 0.91005909, "learning_rate": 3.5432684355333754e-06, "loss": 0.9323234, "num_input_tokens_seen": 43062480, "step": 2020, "time_per_iteration": 3.3127622604370117 }, { "auxiliary_loss_clip": 0.0119032, "auxiliary_loss_mlp": 0.01033683, "balance_loss_clip": 1.05648541, "balance_loss_mlp": 1.02508187, "epoch": 0.24301088198160284, "flos": 25075056332160.0, "grad_norm": 1.9701197398192898, "language_loss": 0.7641958, "learning_rate": 3.5427728406948613e-06, "loss": 0.78643584, "num_input_tokens_seen": 43081595, "step": 2021, "time_per_iteration": 2.5378661155700684 }, { "auxiliary_loss_clip": 0.01078473, "auxiliary_loss_mlp": 0.01003422, "balance_loss_clip": 1.02431774, "balance_loss_mlp": 1.00139511, "epoch": 0.24313112487224192, "flos": 69900948673920.0, "grad_norm": 0.7626906419244385, "language_loss": 0.57938707, "learning_rate": 3.542277011818934e-06, "loss": 0.60020602, "num_input_tokens_seen": 43145430, "step": 2022, "time_per_iteration": 3.9090769290924072 }, { "auxiliary_loss_clip": 0.0118045, "auxiliary_loss_mlp": 0.01034977, "balance_loss_clip": 1.06094098, "balance_loss_mlp": 1.02604818, "epoch": 0.24325136776288103, "flos": 40662334235520.0, "grad_norm": 2.0957282987055907, "language_loss": 0.74553931, "learning_rate": 3.5417809489808104e-06, "loss": 0.76769358, "num_input_tokens_seen": 43167040, "step": 2023, "time_per_iteration": 2.7028770446777344 }, { "auxiliary_loss_clip": 0.01195641, "auxiliary_loss_mlp": 0.01036561, "balance_loss_clip": 1.06112671, "balance_loss_mlp": 1.02827024, "epoch": 0.24337161065352012, "flos": 25046400257280.0, "grad_norm": 1.787530232634607, "language_loss": 0.72663289, "learning_rate": 3.5412846522557422e-06, "loss": 0.74895489, "num_input_tokens_seen": 43187930, "step": 2024, "time_per_iteration": 3.3037662506103516 }, { "auxiliary_loss_clip": 0.01205974, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.05987763, "balance_loss_mlp": 1.02572715, "epoch": 0.2434918535441592, "flos": 18661160090880.0, "grad_norm": 2.021311320657345, "language_loss": 0.7433567, "learning_rate": 3.540788121719018e-06, "loss": 0.76577342, "num_input_tokens_seen": 43206350, "step": 2025, "time_per_iteration": 2.450728178024292 }, { "auxiliary_loss_clip": 0.01152816, "auxiliary_loss_mlp": 0.01031565, "balance_loss_clip": 1.0572741, "balance_loss_mlp": 1.0219568, "epoch": 0.24361209643479828, "flos": 23915142345600.0, "grad_norm": 2.35238184545431, "language_loss": 0.81752419, "learning_rate": 3.5402913574459604e-06, "loss": 0.83936799, "num_input_tokens_seen": 43226255, "step": 2026, "time_per_iteration": 2.5941474437713623 }, { "auxiliary_loss_clip": 0.01127306, "auxiliary_loss_mlp": 0.01034765, "balance_loss_clip": 1.04753542, "balance_loss_mlp": 1.02615201, "epoch": 0.2437323393254374, "flos": 28657505232000.0, "grad_norm": 1.652752112566464, "language_loss": 0.86068058, "learning_rate": 3.5397943595119297e-06, "loss": 0.88230133, "num_input_tokens_seen": 43247675, "step": 2027, "time_per_iteration": 2.690549373626709 }, { "auxiliary_loss_clip": 0.01175971, "auxiliary_loss_mlp": 0.01035581, "balance_loss_clip": 1.05985713, "balance_loss_mlp": 1.02618742, "epoch": 0.24385258221607647, "flos": 23550325862400.0, "grad_norm": 2.25750268638344, "language_loss": 0.77470267, "learning_rate": 3.5392971279923177e-06, "loss": 0.79681814, "num_input_tokens_seen": 43265895, "step": 2028, "time_per_iteration": 2.554649591445923 }, { "auxiliary_loss_clip": 0.0115731, "auxiliary_loss_mlp": 0.01035961, "balance_loss_clip": 1.05277944, "balance_loss_mlp": 1.02675223, "epoch": 0.24397282510671556, "flos": 25336091445120.0, "grad_norm": 1.968471344433543, "language_loss": 0.82893121, "learning_rate": 3.5387996629625557e-06, "loss": 0.85086393, "num_input_tokens_seen": 43283485, "step": 2029, "time_per_iteration": 2.5910468101501465 }, { "auxiliary_loss_clip": 0.0109859, "auxiliary_loss_mlp": 0.01002684, "balance_loss_clip": 1.02401328, "balance_loss_mlp": 1.00078857, "epoch": 0.24409306799735467, "flos": 65187421430400.0, "grad_norm": 0.7970891012011679, "language_loss": 0.55021167, "learning_rate": 3.5383019644981083e-06, "loss": 0.57122439, "num_input_tokens_seen": 43347180, "step": 2030, "time_per_iteration": 3.0983214378356934 }, { "auxiliary_loss_clip": 0.01175172, "auxiliary_loss_mlp": 0.01044099, "balance_loss_clip": 1.05758572, "balance_loss_mlp": 1.03400171, "epoch": 0.24421331088799375, "flos": 19537093152000.0, "grad_norm": 2.219549242306186, "language_loss": 0.73066509, "learning_rate": 3.5378040326744763e-06, "loss": 0.7528578, "num_input_tokens_seen": 43366665, "step": 2031, "time_per_iteration": 2.5473358631134033 }, { "auxiliary_loss_clip": 0.01162869, "auxiliary_loss_mlp": 0.01046395, "balance_loss_clip": 1.05821848, "balance_loss_mlp": 1.03686976, "epoch": 0.24433355377863283, "flos": 21068575378560.0, "grad_norm": 2.021085035112213, "language_loss": 0.85706258, "learning_rate": 3.5373058675671946e-06, "loss": 0.87915528, "num_input_tokens_seen": 43384670, "step": 2032, "time_per_iteration": 2.5618579387664795 }, { "auxiliary_loss_clip": 0.01140193, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.05263758, "balance_loss_mlp": 1.02587271, "epoch": 0.24445379666927192, "flos": 22637189289600.0, "grad_norm": 2.288698870043243, "language_loss": 0.72526121, "learning_rate": 3.536807469251836e-06, "loss": 0.74702138, "num_input_tokens_seen": 43403825, "step": 2033, "time_per_iteration": 2.6115474700927734 }, { "auxiliary_loss_clip": 0.01165045, "auxiliary_loss_mlp": 0.01025909, "balance_loss_clip": 1.05133176, "balance_loss_mlp": 1.01663458, "epoch": 0.24457403955991103, "flos": 21251612108160.0, "grad_norm": 1.8276958591021424, "language_loss": 0.82867777, "learning_rate": 3.5363088378040055e-06, "loss": 0.85058737, "num_input_tokens_seen": 43422715, "step": 2034, "time_per_iteration": 2.5768396854400635 }, { "auxiliary_loss_clip": 0.01098434, "auxiliary_loss_mlp": 0.00752922, "balance_loss_clip": 1.02418447, "balance_loss_mlp": 1.00004292, "epoch": 0.2446942824505501, "flos": 66997820764800.0, "grad_norm": 0.7489855076037047, "language_loss": 0.64356792, "learning_rate": 3.5358099732993463e-06, "loss": 0.66208148, "num_input_tokens_seen": 43481825, "step": 2035, "time_per_iteration": 3.021388292312622 }, { "auxiliary_loss_clip": 0.01182785, "auxiliary_loss_mlp": 0.01035747, "balance_loss_clip": 1.05862236, "balance_loss_mlp": 1.02690768, "epoch": 0.2448145253411892, "flos": 20411122792320.0, "grad_norm": 1.7820900520839353, "language_loss": 0.89266747, "learning_rate": 3.535310875813535e-06, "loss": 0.9148528, "num_input_tokens_seen": 43500220, "step": 2036, "time_per_iteration": 2.5306057929992676 }, { "auxiliary_loss_clip": 0.01187181, "auxiliary_loss_mlp": 0.01033229, "balance_loss_clip": 1.05693364, "balance_loss_mlp": 1.0250572, "epoch": 0.2449347682318283, "flos": 28804739080320.0, "grad_norm": 1.678557160204201, "language_loss": 0.81141776, "learning_rate": 3.5348115454222843e-06, "loss": 0.83362186, "num_input_tokens_seen": 43522805, "step": 2037, "time_per_iteration": 2.5718765258789062 }, { "auxiliary_loss_clip": 0.01169307, "auxiliary_loss_mlp": 0.01036971, "balance_loss_clip": 1.05222261, "balance_loss_mlp": 1.02763057, "epoch": 0.2450550111224674, "flos": 22528990546560.0, "grad_norm": 2.0142434743497284, "language_loss": 0.86593926, "learning_rate": 3.5343119822013425e-06, "loss": 0.88800204, "num_input_tokens_seen": 43541915, "step": 2038, "time_per_iteration": 2.5527615547180176 }, { "auxiliary_loss_clip": 0.01196606, "auxiliary_loss_mlp": 0.01032423, "balance_loss_clip": 1.05854142, "balance_loss_mlp": 1.02242136, "epoch": 0.24517525401310647, "flos": 21759137326080.0, "grad_norm": 1.940488092489145, "language_loss": 0.77749395, "learning_rate": 3.533812186226493e-06, "loss": 0.79978424, "num_input_tokens_seen": 43562625, "step": 2039, "time_per_iteration": 2.518183469772339 }, { "auxiliary_loss_clip": 0.01199948, "auxiliary_loss_mlp": 0.01028216, "balance_loss_clip": 1.05753016, "balance_loss_mlp": 1.02033639, "epoch": 0.24529549690374555, "flos": 25043311687680.0, "grad_norm": 1.7055034204825394, "language_loss": 0.75839496, "learning_rate": 3.5333121575735545e-06, "loss": 0.78067654, "num_input_tokens_seen": 43582265, "step": 2040, "time_per_iteration": 3.232445478439331 }, { "auxiliary_loss_clip": 0.01179193, "auxiliary_loss_mlp": 0.0103759, "balance_loss_clip": 1.05984545, "balance_loss_mlp": 1.02894139, "epoch": 0.24541573979438466, "flos": 32123638915200.0, "grad_norm": 1.7834670793592902, "language_loss": 0.75444269, "learning_rate": 3.532811896318381e-06, "loss": 0.77661055, "num_input_tokens_seen": 43604335, "step": 2041, "time_per_iteration": 2.619645357131958 }, { "auxiliary_loss_clip": 0.01162368, "auxiliary_loss_mlp": 0.01041288, "balance_loss_clip": 1.05537295, "balance_loss_mlp": 1.031847, "epoch": 0.24553598268502375, "flos": 31357556622720.0, "grad_norm": 2.091033297684415, "language_loss": 0.8204518, "learning_rate": 3.5323114025368615e-06, "loss": 0.84248829, "num_input_tokens_seen": 43619400, "step": 2042, "time_per_iteration": 2.6189913749694824 }, { "auxiliary_loss_clip": 0.01184145, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.0530808, "balance_loss_mlp": 1.02248442, "epoch": 0.24565622557566283, "flos": 14027462824320.0, "grad_norm": 2.4633406898713006, "language_loss": 0.81721365, "learning_rate": 3.53181067630492e-06, "loss": 0.8393712, "num_input_tokens_seen": 43636870, "step": 2043, "time_per_iteration": 2.473313093185425 }, { "auxiliary_loss_clip": 0.0116864, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.05474043, "balance_loss_mlp": 1.02580309, "epoch": 0.24577646846630194, "flos": 16581465515520.0, "grad_norm": 2.0079178219909957, "language_loss": 0.7653147, "learning_rate": 3.5313097176985175e-06, "loss": 0.7873522, "num_input_tokens_seen": 43655180, "step": 2044, "time_per_iteration": 2.506767511367798 }, { "auxiliary_loss_clip": 0.01190721, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.05976248, "balance_loss_mlp": 1.02517796, "epoch": 0.24589671135694102, "flos": 18807424272000.0, "grad_norm": 1.7935581894155082, "language_loss": 0.81077057, "learning_rate": 3.5308085267936482e-06, "loss": 0.83302152, "num_input_tokens_seen": 43672895, "step": 2045, "time_per_iteration": 2.4861443042755127 }, { "auxiliary_loss_clip": 0.01137151, "auxiliary_loss_mlp": 0.00761856, "balance_loss_clip": 1.0561167, "balance_loss_mlp": 1.00044453, "epoch": 0.2460169542475801, "flos": 19938538529280.0, "grad_norm": 1.9186084300162136, "language_loss": 0.9009074, "learning_rate": 3.530307103666342e-06, "loss": 0.9198975, "num_input_tokens_seen": 43691975, "step": 2046, "time_per_iteration": 3.4017856121063232 }, { "auxiliary_loss_clip": 0.01165902, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.0556159, "balance_loss_mlp": 1.02494073, "epoch": 0.24613719713821922, "flos": 24171221381760.0, "grad_norm": 2.097803367019977, "language_loss": 0.80329895, "learning_rate": 3.5298054483926658e-06, "loss": 0.82530546, "num_input_tokens_seen": 43712670, "step": 2047, "time_per_iteration": 2.590782403945923 }, { "auxiliary_loss_clip": 0.01198909, "auxiliary_loss_mlp": 0.01039951, "balance_loss_clip": 1.06117129, "balance_loss_mlp": 1.03038454, "epoch": 0.2462574400288583, "flos": 30221055325440.0, "grad_norm": 1.8533408405161553, "language_loss": 0.83150482, "learning_rate": 3.5293035610487187e-06, "loss": 0.8538934, "num_input_tokens_seen": 43732035, "step": 2048, "time_per_iteration": 3.2985732555389404 }, { "auxiliary_loss_clip": 0.01070202, "auxiliary_loss_mlp": 0.0100323, "balance_loss_clip": 1.02213573, "balance_loss_mlp": 1.001001, "epoch": 0.24637768291949738, "flos": 68943030819840.0, "grad_norm": 0.7325648743332753, "language_loss": 0.62062263, "learning_rate": 3.5288014417106374e-06, "loss": 0.64135695, "num_input_tokens_seen": 43798055, "step": 2049, "time_per_iteration": 3.162182092666626 }, { "auxiliary_loss_clip": 0.01161717, "auxiliary_loss_mlp": 0.0103565, "balance_loss_clip": 1.05762959, "balance_loss_mlp": 1.02590477, "epoch": 0.24649792581013646, "flos": 34383999922560.0, "grad_norm": 1.865234087526529, "language_loss": 0.75888181, "learning_rate": 3.528299090454593e-06, "loss": 0.78085554, "num_input_tokens_seen": 43818590, "step": 2050, "time_per_iteration": 3.4445135593414307 }, { "auxiliary_loss_clip": 0.01191136, "auxiliary_loss_mlp": 0.01043227, "balance_loss_clip": 1.0559305, "balance_loss_mlp": 1.03380966, "epoch": 0.24661816870077558, "flos": 19680448331520.0, "grad_norm": 2.2678396637925458, "language_loss": 0.82837576, "learning_rate": 3.527796507356792e-06, "loss": 0.85071939, "num_input_tokens_seen": 43832480, "step": 2051, "time_per_iteration": 2.500074625015259 }, { "auxiliary_loss_clip": 0.01193363, "auxiliary_loss_mlp": 0.01031498, "balance_loss_clip": 1.0587976, "balance_loss_mlp": 1.02242625, "epoch": 0.24673841159141466, "flos": 20002279213440.0, "grad_norm": 3.3296230919624876, "language_loss": 0.8998152, "learning_rate": 3.527293692493475e-06, "loss": 0.92206389, "num_input_tokens_seen": 43848345, "step": 2052, "time_per_iteration": 2.4892115592956543 }, { "auxiliary_loss_clip": 0.01190571, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.05477738, "balance_loss_mlp": 1.02172542, "epoch": 0.24685865448205374, "flos": 21646593037440.0, "grad_norm": 2.5222571551502786, "language_loss": 0.73614401, "learning_rate": 3.52679064594092e-06, "loss": 0.75836802, "num_input_tokens_seen": 43865685, "step": 2053, "time_per_iteration": 2.6006484031677246 }, { "auxiliary_loss_clip": 0.01132797, "auxiliary_loss_mlp": 0.0103598, "balance_loss_clip": 1.04427433, "balance_loss_mlp": 1.02690172, "epoch": 0.24697889737269285, "flos": 17960470508160.0, "grad_norm": 2.2848898466985132, "language_loss": 0.74934554, "learning_rate": 3.5262873677754375e-06, "loss": 0.77103329, "num_input_tokens_seen": 43883690, "step": 2054, "time_per_iteration": 2.5893354415893555 }, { "auxiliary_loss_clip": 0.0120401, "auxiliary_loss_mlp": 0.01042171, "balance_loss_clip": 1.05970359, "balance_loss_mlp": 1.03309894, "epoch": 0.24709914026333193, "flos": 27344611221120.0, "grad_norm": 1.566083052593149, "language_loss": 0.80220222, "learning_rate": 3.5257838580733745e-06, "loss": 0.82466412, "num_input_tokens_seen": 43903295, "step": 2055, "time_per_iteration": 2.5330142974853516 }, { "auxiliary_loss_clip": 0.01195508, "auxiliary_loss_mlp": 0.0104386, "balance_loss_clip": 1.06034207, "balance_loss_mlp": 1.03423965, "epoch": 0.24721938315397102, "flos": 19275519335040.0, "grad_norm": 1.8923706671773803, "language_loss": 0.86993611, "learning_rate": 3.5252801169111138e-06, "loss": 0.89232969, "num_input_tokens_seen": 43920960, "step": 2056, "time_per_iteration": 2.4767906665802 }, { "auxiliary_loss_clip": 0.01174928, "auxiliary_loss_mlp": 0.01041866, "balance_loss_clip": 1.05937994, "balance_loss_mlp": 1.03252041, "epoch": 0.2473396260446101, "flos": 23185796688000.0, "grad_norm": 1.790332618827767, "language_loss": 0.8024618, "learning_rate": 3.524776144365072e-06, "loss": 0.82462978, "num_input_tokens_seen": 43939415, "step": 2057, "time_per_iteration": 2.5512232780456543 }, { "auxiliary_loss_clip": 0.01175422, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.06181335, "balance_loss_mlp": 1.02574062, "epoch": 0.2474598689352492, "flos": 21142443697920.0, "grad_norm": 1.549440954345555, "language_loss": 0.79298759, "learning_rate": 3.5242719405117016e-06, "loss": 0.81508446, "num_input_tokens_seen": 43959220, "step": 2058, "time_per_iteration": 2.5761241912841797 }, { "auxiliary_loss_clip": 0.0117647, "auxiliary_loss_mlp": 0.00762459, "balance_loss_clip": 1.05689943, "balance_loss_mlp": 1.00044179, "epoch": 0.2475801118258883, "flos": 21648352803840.0, "grad_norm": 5.204197120095358, "language_loss": 0.75026548, "learning_rate": 3.5237675054274893e-06, "loss": 0.76965475, "num_input_tokens_seen": 43978420, "step": 2059, "time_per_iteration": 2.5468244552612305 }, { "auxiliary_loss_clip": 0.01193114, "auxiliary_loss_mlp": 0.01039295, "balance_loss_clip": 1.05975783, "balance_loss_mlp": 1.02966273, "epoch": 0.24770035471652738, "flos": 22674500542080.0, "grad_norm": 2.0107763475090743, "language_loss": 0.80155277, "learning_rate": 3.5232628391889584e-06, "loss": 0.82387686, "num_input_tokens_seen": 43996710, "step": 2060, "time_per_iteration": 2.509671688079834 }, { "auxiliary_loss_clip": 0.01146515, "auxiliary_loss_mlp": 0.01035557, "balance_loss_clip": 1.05579603, "balance_loss_mlp": 1.0269202, "epoch": 0.2478205976071665, "flos": 22163814927360.0, "grad_norm": 2.449697804561298, "language_loss": 0.64268124, "learning_rate": 3.522757941872666e-06, "loss": 0.66450202, "num_input_tokens_seen": 44014865, "step": 2061, "time_per_iteration": 2.612285614013672 }, { "auxiliary_loss_clip": 0.01207944, "auxiliary_loss_mlp": 0.00762332, "balance_loss_clip": 1.06338608, "balance_loss_mlp": 1.00036645, "epoch": 0.24794084049780557, "flos": 24973106555520.0, "grad_norm": 1.5184630795666025, "language_loss": 0.82452261, "learning_rate": 3.5222528135552042e-06, "loss": 0.84422541, "num_input_tokens_seen": 44036325, "step": 2062, "time_per_iteration": 2.5098040103912354 }, { "auxiliary_loss_clip": 0.01191525, "auxiliary_loss_mlp": 0.01036135, "balance_loss_clip": 1.06176221, "balance_loss_mlp": 1.02699172, "epoch": 0.24806108338844465, "flos": 18296379521280.0, "grad_norm": 2.2323221488857756, "language_loss": 0.80416685, "learning_rate": 3.521747454313201e-06, "loss": 0.82644343, "num_input_tokens_seen": 44055005, "step": 2063, "time_per_iteration": 2.4983997344970703 }, { "auxiliary_loss_clip": 0.011522, "auxiliary_loss_mlp": 0.01032926, "balance_loss_clip": 1.0506078, "balance_loss_mlp": 1.02335382, "epoch": 0.24818132627908374, "flos": 19282163351040.0, "grad_norm": 1.918198881903483, "language_loss": 0.6701591, "learning_rate": 3.521241864223319e-06, "loss": 0.69201028, "num_input_tokens_seen": 44073965, "step": 2064, "time_per_iteration": 2.563106060028076 }, { "auxiliary_loss_clip": 0.01074165, "auxiliary_loss_mlp": 0.01008474, "balance_loss_clip": 1.01900458, "balance_loss_mlp": 1.00638759, "epoch": 0.24830156916972285, "flos": 70285837881600.0, "grad_norm": 0.8177070594454853, "language_loss": 0.62068152, "learning_rate": 3.5207360433622552e-06, "loss": 0.64150786, "num_input_tokens_seen": 44135965, "step": 2065, "time_per_iteration": 3.1466333866119385 }, { "auxiliary_loss_clip": 0.01171376, "auxiliary_loss_mlp": 0.01041482, "balance_loss_clip": 1.05841732, "balance_loss_mlp": 1.03248215, "epoch": 0.24842181206036193, "flos": 40409128287360.0, "grad_norm": 1.5564426910655398, "language_loss": 0.74747288, "learning_rate": 3.5202299918067437e-06, "loss": 0.76960146, "num_input_tokens_seen": 44159560, "step": 2066, "time_per_iteration": 3.5447330474853516 }, { "auxiliary_loss_clip": 0.01189841, "auxiliary_loss_mlp": 0.01034169, "balance_loss_clip": 1.06004012, "balance_loss_mlp": 1.02502561, "epoch": 0.248542054951001, "flos": 20082432412800.0, "grad_norm": 2.0580186322027294, "language_loss": 0.6887306, "learning_rate": 3.519723709633551e-06, "loss": 0.7109707, "num_input_tokens_seen": 44178320, "step": 2067, "time_per_iteration": 2.506460666656494 }, { "auxiliary_loss_clip": 0.01170836, "auxiliary_loss_mlp": 0.01038494, "balance_loss_clip": 1.05912971, "balance_loss_mlp": 1.02940392, "epoch": 0.24866229784164012, "flos": 23513948363520.0, "grad_norm": 1.7128055585671824, "language_loss": 0.83352184, "learning_rate": 3.519217196919479e-06, "loss": 0.85561514, "num_input_tokens_seen": 44197305, "step": 2068, "time_per_iteration": 2.567106008529663 }, { "auxiliary_loss_clip": 0.01181899, "auxiliary_loss_mlp": 0.010442, "balance_loss_clip": 1.0613538, "balance_loss_mlp": 1.03558707, "epoch": 0.2487825407322792, "flos": 19865101173120.0, "grad_norm": 1.7242000759708935, "language_loss": 0.73252726, "learning_rate": 3.518710453741367e-06, "loss": 0.75478828, "num_input_tokens_seen": 44216505, "step": 2069, "time_per_iteration": 2.5544862747192383 }, { "auxiliary_loss_clip": 0.01168025, "auxiliary_loss_mlp": 0.00763066, "balance_loss_clip": 1.05652595, "balance_loss_mlp": 1.00037408, "epoch": 0.2489027836229183, "flos": 22017622573440.0, "grad_norm": 2.178248446955164, "language_loss": 0.67952657, "learning_rate": 3.518203480176086e-06, "loss": 0.69883752, "num_input_tokens_seen": 44235435, "step": 2070, "time_per_iteration": 2.5554754734039307 }, { "auxiliary_loss_clip": 0.01117021, "auxiliary_loss_mlp": 0.01041812, "balance_loss_clip": 1.04755211, "balance_loss_mlp": 1.03318107, "epoch": 0.2490230265135574, "flos": 23294354567040.0, "grad_norm": 1.7149304999368582, "language_loss": 0.80937308, "learning_rate": 3.517696276300545e-06, "loss": 0.83096147, "num_input_tokens_seen": 44256975, "step": 2071, "time_per_iteration": 2.7007131576538086 }, { "auxiliary_loss_clip": 0.01189421, "auxiliary_loss_mlp": 0.01038009, "balance_loss_clip": 1.06144381, "balance_loss_mlp": 1.02859771, "epoch": 0.24914326940419648, "flos": 19826784339840.0, "grad_norm": 2.9071190556951394, "language_loss": 0.69728673, "learning_rate": 3.517188842191685e-06, "loss": 0.71956104, "num_input_tokens_seen": 44275125, "step": 2072, "time_per_iteration": 3.2773730754852295 }, { "auxiliary_loss_clip": 0.01189534, "auxiliary_loss_mlp": 0.0103721, "balance_loss_clip": 1.05817795, "balance_loss_mlp": 1.02797759, "epoch": 0.24926351229483557, "flos": 20229271211520.0, "grad_norm": 1.6248453422050848, "language_loss": 0.73864591, "learning_rate": 3.5166811779264837e-06, "loss": 0.76091337, "num_input_tokens_seen": 44295445, "step": 2073, "time_per_iteration": 2.5455198287963867 }, { "auxiliary_loss_clip": 0.01203516, "auxiliary_loss_mlp": 0.01037237, "balance_loss_clip": 1.05921721, "balance_loss_mlp": 1.02807021, "epoch": 0.24938375518547465, "flos": 23294570048640.0, "grad_norm": 1.9100819437822707, "language_loss": 0.78185451, "learning_rate": 3.5161732835819545e-06, "loss": 0.80426204, "num_input_tokens_seen": 44314755, "step": 2074, "time_per_iteration": 3.1958301067352295 }, { "auxiliary_loss_clip": 0.01204043, "auxiliary_loss_mlp": 0.01034546, "balance_loss_clip": 1.06138825, "balance_loss_mlp": 1.02485442, "epoch": 0.24950399807611376, "flos": 17311673099520.0, "grad_norm": 1.9277585970222786, "language_loss": 0.83428413, "learning_rate": 3.515665159235143e-06, "loss": 0.85667002, "num_input_tokens_seen": 44333640, "step": 2075, "time_per_iteration": 3.2876856327056885 }, { "auxiliary_loss_clip": 0.01175188, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.05554438, "balance_loss_mlp": 1.0281347, "epoch": 0.24962424096675284, "flos": 19024863252480.0, "grad_norm": 2.669701419654636, "language_loss": 0.74711627, "learning_rate": 3.5151568049631318e-06, "loss": 0.7692312, "num_input_tokens_seen": 44352355, "step": 2076, "time_per_iteration": 2.5215775966644287 }, { "auxiliary_loss_clip": 0.01206449, "auxiliary_loss_mlp": 0.01040118, "balance_loss_clip": 1.06116295, "balance_loss_mlp": 1.03134406, "epoch": 0.24974448385739192, "flos": 33398790710400.0, "grad_norm": 1.7002361584094747, "language_loss": 0.80480045, "learning_rate": 3.5146482208430385e-06, "loss": 0.8272661, "num_input_tokens_seen": 44374185, "step": 2077, "time_per_iteration": 2.5824756622314453 }, { "auxiliary_loss_clip": 0.01118486, "auxiliary_loss_mlp": 0.01035223, "balance_loss_clip": 1.04584372, "balance_loss_mlp": 1.02538824, "epoch": 0.24986472674803104, "flos": 30007279532160.0, "grad_norm": 2.499260368557853, "language_loss": 0.68201607, "learning_rate": 3.514139406952014e-06, "loss": 0.7035532, "num_input_tokens_seen": 44396210, "step": 2078, "time_per_iteration": 2.6846957206726074 }, { "auxiliary_loss_clip": 0.01190757, "auxiliary_loss_mlp": 0.01030373, "balance_loss_clip": 1.06097293, "balance_loss_mlp": 1.02126527, "epoch": 0.24998496963867012, "flos": 26613074833920.0, "grad_norm": 1.9040632706540188, "language_loss": 0.8373872, "learning_rate": 3.5136303633672454e-06, "loss": 0.85959852, "num_input_tokens_seen": 44416340, "step": 2079, "time_per_iteration": 2.5518417358398438 }, { "auxiliary_loss_clip": 0.0116788, "auxiliary_loss_mlp": 0.00762148, "balance_loss_clip": 1.05626631, "balance_loss_mlp": 1.00039148, "epoch": 0.25010521252930923, "flos": 23553989049600.0, "grad_norm": 1.865780153640428, "language_loss": 0.7474227, "learning_rate": 3.5131210901659544e-06, "loss": 0.76672298, "num_input_tokens_seen": 44438095, "step": 2080, "time_per_iteration": 2.6499993801116943 }, { "auxiliary_loss_clip": 0.01157323, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.05306995, "balance_loss_mlp": 1.02587962, "epoch": 0.2502254554199483, "flos": 23441193365760.0, "grad_norm": 2.323326325765046, "language_loss": 0.82425666, "learning_rate": 3.5126115874253967e-06, "loss": 0.84618562, "num_input_tokens_seen": 44457650, "step": 2081, "time_per_iteration": 2.5925650596618652 }, { "auxiliary_loss_clip": 0.01164205, "auxiliary_loss_mlp": 0.01036635, "balance_loss_clip": 1.05860615, "balance_loss_mlp": 1.02758682, "epoch": 0.2503456983105874, "flos": 28761681651840.0, "grad_norm": 1.7880296153258421, "language_loss": 0.81045371, "learning_rate": 3.5121018552228644e-06, "loss": 0.83246213, "num_input_tokens_seen": 44476155, "step": 2082, "time_per_iteration": 2.613863945007324 }, { "auxiliary_loss_clip": 0.01164347, "auxiliary_loss_mlp": 0.01033491, "balance_loss_clip": 1.05686331, "balance_loss_mlp": 1.02442479, "epoch": 0.2504659412012265, "flos": 18770256673920.0, "grad_norm": 1.9689271835918365, "language_loss": 0.76019555, "learning_rate": 3.5115918936356827e-06, "loss": 0.78217387, "num_input_tokens_seen": 44492910, "step": 2083, "time_per_iteration": 2.5620858669281006 }, { "auxiliary_loss_clip": 0.01141699, "auxiliary_loss_mlp": 0.01045445, "balance_loss_clip": 1.05294764, "balance_loss_mlp": 1.03678417, "epoch": 0.25058618409186556, "flos": 16873383346560.0, "grad_norm": 1.819986072040086, "language_loss": 0.78559077, "learning_rate": 3.5110817027412123e-06, "loss": 0.80746222, "num_input_tokens_seen": 44512000, "step": 2084, "time_per_iteration": 2.539010524749756 }, { "auxiliary_loss_clip": 0.01159257, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 1.05388248, "balance_loss_mlp": 1.02149701, "epoch": 0.25070642698250467, "flos": 24425540651520.0, "grad_norm": 1.789950671282387, "language_loss": 0.68647921, "learning_rate": 3.5105712826168493e-06, "loss": 0.70837021, "num_input_tokens_seen": 44531650, "step": 2085, "time_per_iteration": 2.6275196075439453 }, { "auxiliary_loss_clip": 0.01191029, "auxiliary_loss_mlp": 0.00762286, "balance_loss_clip": 1.05962276, "balance_loss_mlp": 1.00037646, "epoch": 0.2508266698731437, "flos": 20260944028800.0, "grad_norm": 1.8017950535184446, "language_loss": 0.71048778, "learning_rate": 3.5100606333400235e-06, "loss": 0.73002088, "num_input_tokens_seen": 44548785, "step": 2086, "time_per_iteration": 2.5097768306732178 }, { "auxiliary_loss_clip": 0.01186012, "auxiliary_loss_mlp": 0.01039894, "balance_loss_clip": 1.05873084, "balance_loss_mlp": 1.02977324, "epoch": 0.25094691276378284, "flos": 19245318975360.0, "grad_norm": 2.4573984633943464, "language_loss": 0.76862836, "learning_rate": 3.5095497549882006e-06, "loss": 0.79088742, "num_input_tokens_seen": 44567230, "step": 2087, "time_per_iteration": 2.5426225662231445 }, { "auxiliary_loss_clip": 0.01193081, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.06319857, "balance_loss_mlp": 1.02611935, "epoch": 0.25106715565442195, "flos": 26943237671040.0, "grad_norm": 1.8356329869092307, "language_loss": 0.72263098, "learning_rate": 3.50903864763888e-06, "loss": 0.74492085, "num_input_tokens_seen": 44588020, "step": 2088, "time_per_iteration": 2.623161554336548 }, { "auxiliary_loss_clip": 0.01196381, "auxiliary_loss_mlp": 0.010336, "balance_loss_clip": 1.06106842, "balance_loss_mlp": 1.02337766, "epoch": 0.251187398545061, "flos": 48359570572800.0, "grad_norm": 1.9717420473041052, "language_loss": 0.76628649, "learning_rate": 3.5085273113695965e-06, "loss": 0.78858632, "num_input_tokens_seen": 44612590, "step": 2089, "time_per_iteration": 2.7457163333892822 }, { "auxiliary_loss_clip": 0.01209828, "auxiliary_loss_mlp": 0.0104076, "balance_loss_clip": 1.06389046, "balance_loss_mlp": 1.03130674, "epoch": 0.2513076414357001, "flos": 27016100409600.0, "grad_norm": 1.734323545093817, "language_loss": 0.78405309, "learning_rate": 3.508015746257919e-06, "loss": 0.80655897, "num_input_tokens_seen": 44631630, "step": 2090, "time_per_iteration": 2.5342354774475098 }, { "auxiliary_loss_clip": 0.01166418, "auxiliary_loss_mlp": 0.01042085, "balance_loss_clip": 1.0587194, "balance_loss_mlp": 1.03219056, "epoch": 0.2514278843263392, "flos": 19463619882240.0, "grad_norm": 2.116030521161892, "language_loss": 0.83703411, "learning_rate": 3.5075039523814518e-06, "loss": 0.85911918, "num_input_tokens_seen": 44650820, "step": 2091, "time_per_iteration": 2.5807902812957764 }, { "auxiliary_loss_clip": 0.01195807, "auxiliary_loss_mlp": 0.01045744, "balance_loss_clip": 1.05988741, "balance_loss_mlp": 1.03544414, "epoch": 0.2515481272169783, "flos": 16866092885760.0, "grad_norm": 2.095580782107945, "language_loss": 0.81912535, "learning_rate": 3.506991929817834e-06, "loss": 0.84154093, "num_input_tokens_seen": 44667540, "step": 2092, "time_per_iteration": 3.259746551513672 }, { "auxiliary_loss_clip": 0.01204167, "auxiliary_loss_mlp": 0.01041876, "balance_loss_clip": 1.0632726, "balance_loss_mlp": 1.03263104, "epoch": 0.2516683701076174, "flos": 23732464752000.0, "grad_norm": 2.023346688494999, "language_loss": 0.82526082, "learning_rate": 3.506479678644738e-06, "loss": 0.84772122, "num_input_tokens_seen": 44687935, "step": 2093, "time_per_iteration": 2.5011889934539795 }, { "auxiliary_loss_clip": 0.01145885, "auxiliary_loss_mlp": 0.01043841, "balance_loss_clip": 1.0565896, "balance_loss_mlp": 1.03401804, "epoch": 0.2517886129982565, "flos": 27635954434560.0, "grad_norm": 3.581113409802518, "language_loss": 0.73980832, "learning_rate": 3.505967198939873e-06, "loss": 0.76170564, "num_input_tokens_seen": 44704975, "step": 2094, "time_per_iteration": 2.654013156890869 }, { "auxiliary_loss_clip": 0.01173998, "auxiliary_loss_mlp": 0.01036961, "balance_loss_clip": 1.05433595, "balance_loss_mlp": 1.02756119, "epoch": 0.25190885588889556, "flos": 38104596529920.0, "grad_norm": 2.115135815112557, "language_loss": 0.78394109, "learning_rate": 3.5054544907809813e-06, "loss": 0.80605066, "num_input_tokens_seen": 44725475, "step": 2095, "time_per_iteration": 2.6835176944732666 }, { "auxiliary_loss_clip": 0.011748, "auxiliary_loss_mlp": 0.00763251, "balance_loss_clip": 1.06069016, "balance_loss_mlp": 1.00034046, "epoch": 0.25202909877953467, "flos": 22269894768000.0, "grad_norm": 2.462911772878111, "language_loss": 0.80014294, "learning_rate": 3.50494155424584e-06, "loss": 0.81952345, "num_input_tokens_seen": 44744380, "step": 2096, "time_per_iteration": 2.5501749515533447 }, { "auxiliary_loss_clip": 0.01196865, "auxiliary_loss_mlp": 0.01035615, "balance_loss_clip": 1.06081152, "balance_loss_mlp": 1.02576864, "epoch": 0.2521493416701738, "flos": 21761759018880.0, "grad_norm": 1.972974160286277, "language_loss": 0.83279347, "learning_rate": 3.504428389412262e-06, "loss": 0.85511827, "num_input_tokens_seen": 44765190, "step": 2097, "time_per_iteration": 2.5111234188079834 }, { "auxiliary_loss_clip": 0.01192198, "auxiliary_loss_mlp": 0.01038947, "balance_loss_clip": 1.05998671, "balance_loss_mlp": 1.02970874, "epoch": 0.25226958456081283, "flos": 27746738956800.0, "grad_norm": 2.319830016867886, "language_loss": 0.72900105, "learning_rate": 3.5039149963580927e-06, "loss": 0.75131249, "num_input_tokens_seen": 44785210, "step": 2098, "time_per_iteration": 3.342381000518799 }, { "auxiliary_loss_clip": 0.01173367, "auxiliary_loss_mlp": 0.0103728, "balance_loss_clip": 1.05992961, "balance_loss_mlp": 1.02814806, "epoch": 0.25238982745145194, "flos": 30732171903360.0, "grad_norm": 2.900700741504068, "language_loss": 0.70031005, "learning_rate": 3.503401375161215e-06, "loss": 0.72241652, "num_input_tokens_seen": 44804955, "step": 2099, "time_per_iteration": 2.5933611392974854 }, { "auxiliary_loss_clip": 0.01205228, "auxiliary_loss_mlp": 0.01035232, "balance_loss_clip": 1.06217742, "balance_loss_mlp": 1.02559412, "epoch": 0.252510070342091, "flos": 20266331068800.0, "grad_norm": 1.4975971102277843, "language_loss": 0.84040827, "learning_rate": 3.502887525899544e-06, "loss": 0.86281288, "num_input_tokens_seen": 44823935, "step": 2100, "time_per_iteration": 3.214930534362793 }, { "auxiliary_loss_clip": 0.01180835, "auxiliary_loss_mlp": 0.01038398, "balance_loss_clip": 1.06130552, "balance_loss_mlp": 1.02837813, "epoch": 0.2526303132327301, "flos": 22747399194240.0, "grad_norm": 2.6544588653006915, "language_loss": 0.82965523, "learning_rate": 3.50237344865103e-06, "loss": 0.85184753, "num_input_tokens_seen": 44844935, "step": 2101, "time_per_iteration": 3.407270908355713 }, { "auxiliary_loss_clip": 0.01208147, "auxiliary_loss_mlp": 0.01039477, "balance_loss_clip": 1.06271935, "balance_loss_mlp": 1.03008342, "epoch": 0.2527505561233692, "flos": 30263466309120.0, "grad_norm": 2.5576713274023155, "language_loss": 0.76497495, "learning_rate": 3.501859143493658e-06, "loss": 0.78745115, "num_input_tokens_seen": 44865565, "step": 2102, "time_per_iteration": 2.5647566318511963 }, { "auxiliary_loss_clip": 0.0110545, "auxiliary_loss_mlp": 0.01009973, "balance_loss_clip": 1.03026092, "balance_loss_mlp": 1.00786293, "epoch": 0.2528707990140083, "flos": 58492917164160.0, "grad_norm": 0.9579107086231944, "language_loss": 0.60540617, "learning_rate": 3.5013446105054488e-06, "loss": 0.62656045, "num_input_tokens_seen": 44918485, "step": 2103, "time_per_iteration": 2.8456525802612305 }, { "auxiliary_loss_clip": 0.0114601, "auxiliary_loss_mlp": 0.01033429, "balance_loss_clip": 1.05182552, "balance_loss_mlp": 1.02411246, "epoch": 0.2529910419046474, "flos": 24645134448000.0, "grad_norm": 2.06065472405125, "language_loss": 0.74976468, "learning_rate": 3.5008298497644555e-06, "loss": 0.77155918, "num_input_tokens_seen": 44937530, "step": 2104, "time_per_iteration": 2.583155393600464 }, { "auxiliary_loss_clip": 0.01166673, "auxiliary_loss_mlp": 0.01036432, "balance_loss_clip": 1.05919147, "balance_loss_mlp": 1.02643633, "epoch": 0.2531112847952865, "flos": 23842135952640.0, "grad_norm": 1.6561198917650972, "language_loss": 0.87785357, "learning_rate": 3.500314861348767e-06, "loss": 0.89988458, "num_input_tokens_seen": 44958165, "step": 2105, "time_per_iteration": 2.618145704269409 }, { "auxiliary_loss_clip": 0.01154501, "auxiliary_loss_mlp": 0.01039661, "balance_loss_clip": 1.058846, "balance_loss_mlp": 1.02983856, "epoch": 0.25323152768592555, "flos": 16143822207360.0, "grad_norm": 1.7912937061033667, "language_loss": 0.77223045, "learning_rate": 3.499799645336507e-06, "loss": 0.79417205, "num_input_tokens_seen": 44975060, "step": 2106, "time_per_iteration": 2.5525567531585693 }, { "auxiliary_loss_clip": 0.01194877, "auxiliary_loss_mlp": 0.01031485, "balance_loss_clip": 1.06285405, "balance_loss_mlp": 1.02284861, "epoch": 0.25335177057656466, "flos": 28405161210240.0, "grad_norm": 2.0471050882725623, "language_loss": 0.86910975, "learning_rate": 3.4992842018058336e-06, "loss": 0.8913734, "num_input_tokens_seen": 44997960, "step": 2107, "time_per_iteration": 2.6110053062438965 }, { "auxiliary_loss_clip": 0.0116926, "auxiliary_loss_mlp": 0.01032052, "balance_loss_clip": 1.0570128, "balance_loss_mlp": 1.02326012, "epoch": 0.25347201346720377, "flos": 18799666934400.0, "grad_norm": 1.9113913928393098, "language_loss": 0.88225496, "learning_rate": 3.4987685308349384e-06, "loss": 0.90426815, "num_input_tokens_seen": 45015690, "step": 2108, "time_per_iteration": 2.5520761013031006 }, { "auxiliary_loss_clip": 0.01157722, "auxiliary_loss_mlp": 0.01038326, "balance_loss_clip": 1.05107427, "balance_loss_mlp": 1.02908134, "epoch": 0.2535922563578428, "flos": 15815490963840.0, "grad_norm": 2.0307845871271626, "language_loss": 0.61369467, "learning_rate": 3.4982526325020497e-06, "loss": 0.63565516, "num_input_tokens_seen": 45032660, "step": 2109, "time_per_iteration": 2.541287660598755 }, { "auxiliary_loss_clip": 0.01179013, "auxiliary_loss_mlp": 0.01038382, "balance_loss_clip": 1.0585525, "balance_loss_mlp": 1.02906537, "epoch": 0.25371249924848194, "flos": 16318922031360.0, "grad_norm": 2.049653496888196, "language_loss": 0.81709492, "learning_rate": 3.4977365068854273e-06, "loss": 0.8392688, "num_input_tokens_seen": 45048280, "step": 2110, "time_per_iteration": 2.5073964595794678 }, { "auxiliary_loss_clip": 0.01164415, "auxiliary_loss_mlp": 0.01033067, "balance_loss_clip": 1.05391324, "balance_loss_mlp": 1.02304125, "epoch": 0.25383274213912105, "flos": 21761615364480.0, "grad_norm": 1.7913571643324888, "language_loss": 0.73636186, "learning_rate": 3.4972201540633676e-06, "loss": 0.75833666, "num_input_tokens_seen": 45067635, "step": 2111, "time_per_iteration": 2.563694715499878 }, { "auxiliary_loss_clip": 0.01170028, "auxiliary_loss_mlp": 0.0103352, "balance_loss_clip": 1.05812955, "balance_loss_mlp": 1.02412033, "epoch": 0.2539529850297601, "flos": 21396870708480.0, "grad_norm": 1.8084924242117209, "language_loss": 0.85298598, "learning_rate": 3.4967035741142008e-06, "loss": 0.87502146, "num_input_tokens_seen": 45086455, "step": 2112, "time_per_iteration": 2.5316061973571777 }, { "auxiliary_loss_clip": 0.0117278, "auxiliary_loss_mlp": 0.01040226, "balance_loss_clip": 1.06622815, "balance_loss_mlp": 1.03164291, "epoch": 0.2540732279203992, "flos": 25228467319680.0, "grad_norm": 1.8243536631638415, "language_loss": 0.81574547, "learning_rate": 3.4961867671162917e-06, "loss": 0.83787549, "num_input_tokens_seen": 45106385, "step": 2113, "time_per_iteration": 2.587557554244995 }, { "auxiliary_loss_clip": 0.01205034, "auxiliary_loss_mlp": 0.01033504, "balance_loss_clip": 1.06028807, "balance_loss_mlp": 1.02365136, "epoch": 0.2541934708110383, "flos": 19427386037760.0, "grad_norm": 2.639936067526849, "language_loss": 0.77162588, "learning_rate": 3.4956697331480402e-06, "loss": 0.79401124, "num_input_tokens_seen": 45124955, "step": 2114, "time_per_iteration": 2.4816300868988037 }, { "auxiliary_loss_clip": 0.01167254, "auxiliary_loss_mlp": 0.01034429, "balance_loss_clip": 1.05588734, "balance_loss_mlp": 1.02552462, "epoch": 0.2543137137016774, "flos": 23949436855680.0, "grad_norm": 1.5278355720973897, "language_loss": 0.79945493, "learning_rate": 3.495152472287879e-06, "loss": 0.82147175, "num_input_tokens_seen": 45145665, "step": 2115, "time_per_iteration": 2.601029872894287 }, { "auxiliary_loss_clip": 0.01164769, "auxiliary_loss_mlp": 0.01038552, "balance_loss_clip": 1.05813777, "balance_loss_mlp": 1.02868724, "epoch": 0.2544339565923165, "flos": 25593283802880.0, "grad_norm": 1.8267900948925542, "language_loss": 0.73949993, "learning_rate": 3.4946349846142766e-06, "loss": 0.76153314, "num_input_tokens_seen": 45164805, "step": 2116, "time_per_iteration": 2.5854198932647705 }, { "auxiliary_loss_clip": 0.0120564, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 1.06076467, "balance_loss_mlp": 1.02137375, "epoch": 0.25455419948295555, "flos": 21689470897920.0, "grad_norm": 2.1520396583895334, "language_loss": 0.76209307, "learning_rate": 3.4941172702057353e-06, "loss": 0.78446031, "num_input_tokens_seen": 45184865, "step": 2117, "time_per_iteration": 2.477078914642334 }, { "auxiliary_loss_clip": 0.01179193, "auxiliary_loss_mlp": 0.01033098, "balance_loss_clip": 1.06077087, "balance_loss_mlp": 1.02341866, "epoch": 0.25467444237359466, "flos": 26250341339520.0, "grad_norm": 2.168707876738852, "language_loss": 0.80418956, "learning_rate": 3.4935993291407924e-06, "loss": 0.82631242, "num_input_tokens_seen": 45203690, "step": 2118, "time_per_iteration": 3.2881507873535156 }, { "auxiliary_loss_clip": 0.01172635, "auxiliary_loss_mlp": 0.01034101, "balance_loss_clip": 1.05657125, "balance_loss_mlp": 1.02478528, "epoch": 0.25479468526423377, "flos": 26979686997120.0, "grad_norm": 2.1901589668165258, "language_loss": 0.71054924, "learning_rate": 3.4930811614980183e-06, "loss": 0.7326166, "num_input_tokens_seen": 45225385, "step": 2119, "time_per_iteration": 2.574782371520996 }, { "auxiliary_loss_clip": 0.01185267, "auxiliary_loss_mlp": 0.01038367, "balance_loss_clip": 1.05944121, "balance_loss_mlp": 1.02800775, "epoch": 0.2549149281548728, "flos": 23475811098240.0, "grad_norm": 1.7464346767000507, "language_loss": 0.7960819, "learning_rate": 3.4925627673560198e-06, "loss": 0.81831819, "num_input_tokens_seen": 45246045, "step": 2120, "time_per_iteration": 2.5333878993988037 }, { "auxiliary_loss_clip": 0.01160651, "auxiliary_loss_mlp": 0.01033497, "balance_loss_clip": 1.05622363, "balance_loss_mlp": 1.02438354, "epoch": 0.25503517104551193, "flos": 25812302981760.0, "grad_norm": 2.0210532472431204, "language_loss": 0.88352448, "learning_rate": 3.4920441467934357e-06, "loss": 0.90546596, "num_input_tokens_seen": 45266560, "step": 2121, "time_per_iteration": 2.5967423915863037 }, { "auxiliary_loss_clip": 0.01155553, "auxiliary_loss_mlp": 0.01034812, "balance_loss_clip": 1.05687952, "balance_loss_mlp": 1.02560937, "epoch": 0.25515541393615104, "flos": 26645106787200.0, "grad_norm": 5.16058382535851, "language_loss": 0.82863748, "learning_rate": 3.491525299888941e-06, "loss": 0.85054111, "num_input_tokens_seen": 45285405, "step": 2122, "time_per_iteration": 2.6054956912994385 }, { "auxiliary_loss_clip": 0.01082217, "auxiliary_loss_mlp": 0.00753036, "balance_loss_clip": 1.04047155, "balance_loss_mlp": 1.00000405, "epoch": 0.2552756568267901, "flos": 65955945847680.0, "grad_norm": 0.9214712692686856, "language_loss": 0.62708187, "learning_rate": 3.491006226721244e-06, "loss": 0.64543438, "num_input_tokens_seen": 45349615, "step": 2123, "time_per_iteration": 3.9358623027801514 }, { "auxiliary_loss_clip": 0.01182365, "auxiliary_loss_mlp": 0.00762453, "balance_loss_clip": 1.06347585, "balance_loss_mlp": 1.00030303, "epoch": 0.2553958997174292, "flos": 17931096161280.0, "grad_norm": 2.104600949251835, "language_loss": 0.77562422, "learning_rate": 3.4904869273690882e-06, "loss": 0.79507244, "num_input_tokens_seen": 45367505, "step": 2124, "time_per_iteration": 2.5291388034820557 }, { "auxiliary_loss_clip": 0.01194659, "auxiliary_loss_mlp": 0.0103409, "balance_loss_clip": 1.05992293, "balance_loss_mlp": 1.02408862, "epoch": 0.2555161426080683, "flos": 23367791923200.0, "grad_norm": 1.926245537530003, "language_loss": 0.88723183, "learning_rate": 3.489967401911251e-06, "loss": 0.90951937, "num_input_tokens_seen": 45386805, "step": 2125, "time_per_iteration": 3.2687244415283203 }, { "auxiliary_loss_clip": 0.01208642, "auxiliary_loss_mlp": 0.01038527, "balance_loss_clip": 1.06253338, "balance_loss_mlp": 1.02859688, "epoch": 0.2556363854987074, "flos": 40625130723840.0, "grad_norm": 1.5987102161299322, "language_loss": 0.69592202, "learning_rate": 3.4894476504265428e-06, "loss": 0.71839368, "num_input_tokens_seen": 45411045, "step": 2126, "time_per_iteration": 2.6631603240966797 }, { "auxiliary_loss_clip": 0.01086103, "auxiliary_loss_mlp": 0.01001878, "balance_loss_clip": 1.02572727, "balance_loss_mlp": 0.99975616, "epoch": 0.2557566283893465, "flos": 68019443389440.0, "grad_norm": 0.7400381983311672, "language_loss": 0.54510289, "learning_rate": 3.4889276729938104e-06, "loss": 0.5659827, "num_input_tokens_seen": 45469575, "step": 2127, "time_per_iteration": 3.8222787380218506 }, { "auxiliary_loss_clip": 0.01172341, "auxiliary_loss_mlp": 0.01035155, "balance_loss_clip": 1.0580194, "balance_loss_mlp": 1.02580261, "epoch": 0.2558768712799856, "flos": 22635645004800.0, "grad_norm": 3.9151787878578794, "language_loss": 0.80676526, "learning_rate": 3.488407469691934e-06, "loss": 0.8288402, "num_input_tokens_seen": 45490270, "step": 2128, "time_per_iteration": 2.540405750274658 }, { "auxiliary_loss_clip": 0.01172698, "auxiliary_loss_mlp": 0.01037838, "balance_loss_clip": 1.05440748, "balance_loss_mlp": 1.02772927, "epoch": 0.25599711417062465, "flos": 26396354125440.0, "grad_norm": 2.036809592874757, "language_loss": 0.8094548, "learning_rate": 3.487887040599828e-06, "loss": 0.83156013, "num_input_tokens_seen": 45510070, "step": 2129, "time_per_iteration": 2.603438377380371 }, { "auxiliary_loss_clip": 0.01209265, "auxiliary_loss_mlp": 0.01036479, "balance_loss_clip": 1.06360817, "balance_loss_mlp": 1.02717423, "epoch": 0.25611735706126376, "flos": 22852042490880.0, "grad_norm": 2.6839999517770234, "language_loss": 0.76344538, "learning_rate": 3.4873663857964407e-06, "loss": 0.7859028, "num_input_tokens_seen": 45527285, "step": 2130, "time_per_iteration": 2.4817733764648438 }, { "auxiliary_loss_clip": 0.01147692, "auxiliary_loss_mlp": 0.01041692, "balance_loss_clip": 1.05758286, "balance_loss_mlp": 1.03275764, "epoch": 0.2562375999519028, "flos": 23367863750400.0, "grad_norm": 1.7994728708736876, "language_loss": 0.66438794, "learning_rate": 3.4868455053607556e-06, "loss": 0.6862818, "num_input_tokens_seen": 45546900, "step": 2131, "time_per_iteration": 2.613725185394287 }, { "auxiliary_loss_clip": 0.01195561, "auxiliary_loss_mlp": 0.01034167, "balance_loss_clip": 1.05935526, "balance_loss_mlp": 1.02393878, "epoch": 0.2563578428425419, "flos": 22856962654080.0, "grad_norm": 2.2093335964952305, "language_loss": 0.71906871, "learning_rate": 3.486324399371789e-06, "loss": 0.74136603, "num_input_tokens_seen": 45566200, "step": 2132, "time_per_iteration": 2.5071308612823486 }, { "auxiliary_loss_clip": 0.01159795, "auxiliary_loss_mlp": 0.01030304, "balance_loss_clip": 1.0591054, "balance_loss_mlp": 1.02213836, "epoch": 0.25647808573318104, "flos": 21653883498240.0, "grad_norm": 1.985262301160189, "language_loss": 0.78388047, "learning_rate": 3.485803067908593e-06, "loss": 0.80578148, "num_input_tokens_seen": 45585710, "step": 2133, "time_per_iteration": 2.5696043968200684 }, { "auxiliary_loss_clip": 0.01106925, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.04517937, "balance_loss_mlp": 1.02140033, "epoch": 0.2565983286238201, "flos": 33730569659520.0, "grad_norm": 2.177025844082378, "language_loss": 0.7928834, "learning_rate": 3.485281511050253e-06, "loss": 0.81426561, "num_input_tokens_seen": 45607845, "step": 2134, "time_per_iteration": 2.725576639175415 }, { "auxiliary_loss_clip": 0.0119225, "auxiliary_loss_mlp": 0.01034995, "balance_loss_clip": 1.05870152, "balance_loss_mlp": 1.02563739, "epoch": 0.2567185715144592, "flos": 16216002587520.0, "grad_norm": 2.500840084269615, "language_loss": 0.90160578, "learning_rate": 3.484759728875889e-06, "loss": 0.92387819, "num_input_tokens_seen": 45623210, "step": 2135, "time_per_iteration": 2.470620632171631 }, { "auxiliary_loss_clip": 0.011373, "auxiliary_loss_mlp": 0.01039782, "balance_loss_clip": 1.05258298, "balance_loss_mlp": 1.03037083, "epoch": 0.2568388144050983, "flos": 17458475984640.0, "grad_norm": 1.9506446087134908, "language_loss": 0.80931163, "learning_rate": 3.4842377214646543e-06, "loss": 0.83108246, "num_input_tokens_seen": 45641505, "step": 2136, "time_per_iteration": 2.573622703552246 }, { "auxiliary_loss_clip": 0.01205844, "auxiliary_loss_mlp": 0.01035308, "balance_loss_clip": 1.06353736, "balance_loss_mlp": 1.02668917, "epoch": 0.25695905729573737, "flos": 20887442069760.0, "grad_norm": 1.8315349626897552, "language_loss": 0.66627455, "learning_rate": 3.483715488895737e-06, "loss": 0.68868601, "num_input_tokens_seen": 45661835, "step": 2137, "time_per_iteration": 2.4838802814483643 }, { "auxiliary_loss_clip": 0.0114672, "auxiliary_loss_mlp": 0.01032545, "balance_loss_clip": 1.05340874, "balance_loss_mlp": 1.02355707, "epoch": 0.2570793001863765, "flos": 24717278914560.0, "grad_norm": 2.0158119872474765, "language_loss": 0.78894293, "learning_rate": 3.48319303124836e-06, "loss": 0.81073558, "num_input_tokens_seen": 45682215, "step": 2138, "time_per_iteration": 2.63031005859375 }, { "auxiliary_loss_clip": 0.01172604, "auxiliary_loss_mlp": 0.01033173, "balance_loss_clip": 1.06052518, "balance_loss_mlp": 1.02430952, "epoch": 0.2571995430770156, "flos": 26906896085760.0, "grad_norm": 2.793456369143242, "language_loss": 0.66679883, "learning_rate": 3.4826703486017798e-06, "loss": 0.6888566, "num_input_tokens_seen": 45701840, "step": 2139, "time_per_iteration": 2.582402467727661 }, { "auxiliary_loss_clip": 0.01190277, "auxiliary_loss_mlp": 0.0103021, "balance_loss_clip": 1.06154346, "balance_loss_mlp": 1.02091169, "epoch": 0.25731978596765465, "flos": 19792561656960.0, "grad_norm": 1.6031812941327097, "language_loss": 0.76937962, "learning_rate": 3.4821474410352867e-06, "loss": 0.79158449, "num_input_tokens_seen": 45720500, "step": 2140, "time_per_iteration": 2.5038199424743652 }, { "auxiliary_loss_clip": 0.01060706, "auxiliary_loss_mlp": 0.01007477, "balance_loss_clip": 1.02169466, "balance_loss_mlp": 1.00536728, "epoch": 0.25744002885829376, "flos": 70564970471040.0, "grad_norm": 0.9061539242779558, "language_loss": 0.62688702, "learning_rate": 3.481624308628205e-06, "loss": 0.64756888, "num_input_tokens_seen": 45781870, "step": 2141, "time_per_iteration": 3.2573835849761963 }, { "auxiliary_loss_clip": 0.01175275, "auxiliary_loss_mlp": 0.01031658, "balance_loss_clip": 1.05849123, "balance_loss_mlp": 1.02153158, "epoch": 0.25756027174893287, "flos": 18038181582720.0, "grad_norm": 2.556395930723889, "language_loss": 1.00268567, "learning_rate": 3.481100951459893e-06, "loss": 1.024755, "num_input_tokens_seen": 45794890, "step": 2142, "time_per_iteration": 2.503222703933716 }, { "auxiliary_loss_clip": 0.01187605, "auxiliary_loss_mlp": 0.01036973, "balance_loss_clip": 1.05879259, "balance_loss_mlp": 1.02869415, "epoch": 0.2576805146395719, "flos": 22674069578880.0, "grad_norm": 2.274219109170958, "language_loss": 0.78566885, "learning_rate": 3.4805773696097453e-06, "loss": 0.80791461, "num_input_tokens_seen": 45815780, "step": 2143, "time_per_iteration": 2.5410521030426025 }, { "auxiliary_loss_clip": 0.01166605, "auxiliary_loss_mlp": 0.01028364, "balance_loss_clip": 1.05726957, "balance_loss_mlp": 1.01969147, "epoch": 0.25780075753021103, "flos": 16472225278080.0, "grad_norm": 2.004631116964035, "language_loss": 0.8739922, "learning_rate": 3.4800535631571874e-06, "loss": 0.89594185, "num_input_tokens_seen": 45831310, "step": 2144, "time_per_iteration": 3.221395969390869 }, { "auxiliary_loss_clip": 0.0117922, "auxiliary_loss_mlp": 0.01033953, "balance_loss_clip": 1.05859828, "balance_loss_mlp": 1.02435052, "epoch": 0.25792100042085014, "flos": 22820297846400.0, "grad_norm": 1.936042699235944, "language_loss": 0.75970483, "learning_rate": 3.4795295321816804e-06, "loss": 0.78183663, "num_input_tokens_seen": 45850135, "step": 2145, "time_per_iteration": 2.5444693565368652 }, { "auxiliary_loss_clip": 0.01161884, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.05561686, "balance_loss_mlp": 1.02289844, "epoch": 0.2580412433114892, "flos": 18697286194560.0, "grad_norm": 2.0181284868841667, "language_loss": 0.91005611, "learning_rate": 3.47900527676272e-06, "loss": 0.93199944, "num_input_tokens_seen": 45868470, "step": 2146, "time_per_iteration": 2.507291078567505 }, { "auxiliary_loss_clip": 0.01207153, "auxiliary_loss_mlp": 0.01033045, "balance_loss_clip": 1.06362677, "balance_loss_mlp": 1.02353263, "epoch": 0.2581614862021283, "flos": 14283146810880.0, "grad_norm": 1.9621115680944143, "language_loss": 0.88024795, "learning_rate": 3.478480796979835e-06, "loss": 0.90264988, "num_input_tokens_seen": 45886355, "step": 2147, "time_per_iteration": 2.460406541824341 }, { "auxiliary_loss_clip": 0.0117092, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.05667269, "balance_loss_mlp": 1.02690005, "epoch": 0.25828172909276736, "flos": 29498281856640.0, "grad_norm": 1.6179682922230085, "language_loss": 0.7777583, "learning_rate": 3.4779560929125894e-06, "loss": 0.79982167, "num_input_tokens_seen": 45907900, "step": 2148, "time_per_iteration": 2.6158714294433594 }, { "auxiliary_loss_clip": 0.01059305, "auxiliary_loss_mlp": 0.01010434, "balance_loss_clip": 1.02169824, "balance_loss_mlp": 1.00833571, "epoch": 0.2584019719834065, "flos": 67114387376640.0, "grad_norm": 0.7028056811002855, "language_loss": 0.56916738, "learning_rate": 3.4774311646405783e-06, "loss": 0.58986473, "num_input_tokens_seen": 45977805, "step": 2149, "time_per_iteration": 3.2564096450805664 }, { "auxiliary_loss_clip": 0.01147708, "auxiliary_loss_mlp": 0.01032817, "balance_loss_clip": 1.05097175, "balance_loss_mlp": 1.02382851, "epoch": 0.2585222148740456, "flos": 22893555634560.0, "grad_norm": 1.9874780152607852, "language_loss": 0.83605719, "learning_rate": 3.476906012243435e-06, "loss": 0.85786235, "num_input_tokens_seen": 45996715, "step": 2150, "time_per_iteration": 3.3424301147460938 }, { "auxiliary_loss_clip": 0.01178557, "auxiliary_loss_mlp": 0.01037597, "balance_loss_clip": 1.05830884, "balance_loss_mlp": 1.02852476, "epoch": 0.25864245776468464, "flos": 28909202808960.0, "grad_norm": 1.5785337566136524, "language_loss": 0.8104434, "learning_rate": 3.476380635800824e-06, "loss": 0.832605, "num_input_tokens_seen": 46017915, "step": 2151, "time_per_iteration": 2.560530424118042 }, { "auxiliary_loss_clip": 0.01174491, "auxiliary_loss_mlp": 0.0103272, "balance_loss_clip": 1.06084287, "balance_loss_mlp": 1.02392197, "epoch": 0.25876270065532375, "flos": 14793185980800.0, "grad_norm": 2.188430144407419, "language_loss": 0.85692233, "learning_rate": 3.475855035392444e-06, "loss": 0.87899446, "num_input_tokens_seen": 46033235, "step": 2152, "time_per_iteration": 3.222622871398926 }, { "auxiliary_loss_clip": 0.01128042, "auxiliary_loss_mlp": 0.01040217, "balance_loss_clip": 1.052665, "balance_loss_mlp": 1.03158081, "epoch": 0.25888294354596286, "flos": 60467821810560.0, "grad_norm": 1.865161822656067, "language_loss": 0.71090758, "learning_rate": 3.475329211098029e-06, "loss": 0.73259008, "num_input_tokens_seen": 46056390, "step": 2153, "time_per_iteration": 3.640693426132202 }, { "auxiliary_loss_clip": 0.01150272, "auxiliary_loss_mlp": 0.01035775, "balance_loss_clip": 1.05565381, "balance_loss_mlp": 1.02696538, "epoch": 0.2590031864366019, "flos": 27851166771840.0, "grad_norm": 2.0458219043637524, "language_loss": 0.82278407, "learning_rate": 3.4748031629973453e-06, "loss": 0.84464455, "num_input_tokens_seen": 46077120, "step": 2154, "time_per_iteration": 2.6299588680267334 }, { "auxiliary_loss_clip": 0.01038693, "auxiliary_loss_mlp": 0.01001958, "balance_loss_clip": 1.01459205, "balance_loss_mlp": 0.99977684, "epoch": 0.25912342932724103, "flos": 62422444206720.0, "grad_norm": 0.9211520584524355, "language_loss": 0.56637025, "learning_rate": 3.4742768911701944e-06, "loss": 0.58677673, "num_input_tokens_seen": 46139815, "step": 2155, "time_per_iteration": 3.261922597885132 }, { "auxiliary_loss_clip": 0.01193268, "auxiliary_loss_mlp": 0.01046639, "balance_loss_clip": 1.06346941, "balance_loss_mlp": 1.03632176, "epoch": 0.25924367221788014, "flos": 12378839368320.0, "grad_norm": 3.697315383357547, "language_loss": 0.70035362, "learning_rate": 3.4737503956964113e-06, "loss": 0.72275269, "num_input_tokens_seen": 46152120, "step": 2156, "time_per_iteration": 2.4388608932495117 }, { "auxiliary_loss_clip": 0.01166998, "auxiliary_loss_mlp": 0.01030629, "balance_loss_clip": 1.05508125, "balance_loss_mlp": 1.02065122, "epoch": 0.2593639151085192, "flos": 14575208296320.0, "grad_norm": 2.2188666164800503, "language_loss": 0.66898608, "learning_rate": 3.473223676655865e-06, "loss": 0.69096237, "num_input_tokens_seen": 46170120, "step": 2157, "time_per_iteration": 2.4890077114105225 }, { "auxiliary_loss_clip": 0.01166242, "auxiliary_loss_mlp": 0.01034256, "balance_loss_clip": 1.05243301, "balance_loss_mlp": 1.02449894, "epoch": 0.2594841579991583, "flos": 15230937029760.0, "grad_norm": 1.8775092978324723, "language_loss": 0.79973745, "learning_rate": 3.472696734128459e-06, "loss": 0.82174242, "num_input_tokens_seen": 46187985, "step": 2158, "time_per_iteration": 2.5076100826263428 }, { "auxiliary_loss_clip": 0.01191141, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.06105661, "balance_loss_mlp": 1.02634287, "epoch": 0.2596044008897974, "flos": 23623583650560.0, "grad_norm": 1.7017729438129465, "language_loss": 0.75454652, "learning_rate": 3.4721695681941286e-06, "loss": 0.77681094, "num_input_tokens_seen": 46207025, "step": 2159, "time_per_iteration": 2.519624710083008 }, { "auxiliary_loss_clip": 0.01174461, "auxiliary_loss_mlp": 0.00762126, "balance_loss_clip": 1.05773032, "balance_loss_mlp": 1.00029922, "epoch": 0.25972464378043647, "flos": 13772281628160.0, "grad_norm": 1.980150612696572, "language_loss": 0.82714975, "learning_rate": 3.471642178932845e-06, "loss": 0.8465156, "num_input_tokens_seen": 46225670, "step": 2160, "time_per_iteration": 2.5327861309051514 }, { "auxiliary_loss_clip": 0.01174649, "auxiliary_loss_mlp": 0.01034334, "balance_loss_clip": 1.05456042, "balance_loss_mlp": 1.02507758, "epoch": 0.2598448866710756, "flos": 19573578391680.0, "grad_norm": 2.2367871869853295, "language_loss": 0.89613599, "learning_rate": 3.471114566424613e-06, "loss": 0.91822582, "num_input_tokens_seen": 46244130, "step": 2161, "time_per_iteration": 2.520111560821533 }, { "auxiliary_loss_clip": 0.01173615, "auxiliary_loss_mlp": 0.01037601, "balance_loss_clip": 1.05972505, "balance_loss_mlp": 1.02843332, "epoch": 0.25996512956171464, "flos": 21653237053440.0, "grad_norm": 3.4019081036117758, "language_loss": 0.75799209, "learning_rate": 3.4705867307494715e-06, "loss": 0.78010428, "num_input_tokens_seen": 46263200, "step": 2162, "time_per_iteration": 2.5772922039031982 }, { "auxiliary_loss_clip": 0.01190555, "auxiliary_loss_mlp": 0.01033844, "balance_loss_clip": 1.05747652, "balance_loss_mlp": 1.02409315, "epoch": 0.26008537245235375, "flos": 18223480869120.0, "grad_norm": 2.5485588748149866, "language_loss": 0.84810233, "learning_rate": 3.470058671987492e-06, "loss": 0.87034631, "num_input_tokens_seen": 46281465, "step": 2163, "time_per_iteration": 2.489537477493286 }, { "auxiliary_loss_clip": 0.01192774, "auxiliary_loss_mlp": 0.01036866, "balance_loss_clip": 1.05773246, "balance_loss_mlp": 1.02716267, "epoch": 0.26020561534299286, "flos": 24645385843200.0, "grad_norm": 2.9041781916593954, "language_loss": 0.84432018, "learning_rate": 3.4695303902187805e-06, "loss": 0.86661661, "num_input_tokens_seen": 46301020, "step": 2164, "time_per_iteration": 2.6259748935699463 }, { "auxiliary_loss_clip": 0.01153807, "auxiliary_loss_mlp": 0.0103503, "balance_loss_clip": 1.05318832, "balance_loss_mlp": 1.02606547, "epoch": 0.2603258582336319, "flos": 25773662926080.0, "grad_norm": 1.9600515373652982, "language_loss": 0.78715575, "learning_rate": 3.469001885523478e-06, "loss": 0.80904412, "num_input_tokens_seen": 46321740, "step": 2165, "time_per_iteration": 2.6274774074554443 }, { "auxiliary_loss_clip": 0.01204329, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.06091666, "balance_loss_mlp": 1.02569318, "epoch": 0.260446101124271, "flos": 28766314506240.0, "grad_norm": 1.5581847010125418, "language_loss": 0.81077945, "learning_rate": 3.4684731579817568e-06, "loss": 0.83317155, "num_input_tokens_seen": 46342730, "step": 2166, "time_per_iteration": 2.544976234436035 }, { "auxiliary_loss_clip": 0.0112493, "auxiliary_loss_mlp": 0.0103822, "balance_loss_clip": 1.05285692, "balance_loss_mlp": 1.02946448, "epoch": 0.26056634401491013, "flos": 25666757072640.0, "grad_norm": 1.9586428654441055, "language_loss": 0.7680524, "learning_rate": 3.4679442076738247e-06, "loss": 0.78968394, "num_input_tokens_seen": 46362445, "step": 2167, "time_per_iteration": 2.6525888442993164 }, { "auxiliary_loss_clip": 0.01206663, "auxiliary_loss_mlp": 0.01035429, "balance_loss_clip": 1.06308985, "balance_loss_mlp": 1.02526641, "epoch": 0.2606865869055492, "flos": 27052765217280.0, "grad_norm": 1.7606547668052164, "language_loss": 0.83506858, "learning_rate": 3.4674150346799245e-06, "loss": 0.85748953, "num_input_tokens_seen": 46382145, "step": 2168, "time_per_iteration": 2.51381254196167 }, { "auxiliary_loss_clip": 0.01172954, "auxiliary_loss_mlp": 0.0103204, "balance_loss_clip": 1.05756521, "balance_loss_mlp": 1.02329564, "epoch": 0.2608068297961883, "flos": 17712615686400.0, "grad_norm": 2.5088389714281245, "language_loss": 0.80450433, "learning_rate": 3.4668856390803295e-06, "loss": 0.82655424, "num_input_tokens_seen": 46400025, "step": 2169, "time_per_iteration": 2.494565963745117 }, { "auxiliary_loss_clip": 0.01175939, "auxiliary_loss_mlp": 0.01037079, "balance_loss_clip": 1.05661988, "balance_loss_mlp": 1.02728021, "epoch": 0.2609270726868274, "flos": 18551632544640.0, "grad_norm": 1.8443108956758443, "language_loss": 0.89834225, "learning_rate": 3.4663560209553495e-06, "loss": 0.92047238, "num_input_tokens_seen": 46418090, "step": 2170, "time_per_iteration": 3.2861616611480713 }, { "auxiliary_loss_clip": 0.01164631, "auxiliary_loss_mlp": 0.01035064, "balance_loss_clip": 1.05582082, "balance_loss_mlp": 1.026088, "epoch": 0.26104731557746647, "flos": 21835699165440.0, "grad_norm": 1.623082782506732, "language_loss": 0.79230714, "learning_rate": 3.4658261803853267e-06, "loss": 0.81430411, "num_input_tokens_seen": 46436015, "step": 2171, "time_per_iteration": 2.548447847366333 }, { "auxiliary_loss_clip": 0.01169157, "auxiliary_loss_mlp": 0.01030095, "balance_loss_clip": 1.05906343, "balance_loss_mlp": 1.02044463, "epoch": 0.2611675584681056, "flos": 21689650465920.0, "grad_norm": 3.7303626463122104, "language_loss": 0.80973983, "learning_rate": 3.4652961174506383e-06, "loss": 0.83173233, "num_input_tokens_seen": 46455885, "step": 2172, "time_per_iteration": 2.554018259048462 }, { "auxiliary_loss_clip": 0.01073455, "auxiliary_loss_mlp": 0.01002989, "balance_loss_clip": 1.01766753, "balance_loss_mlp": 1.00090301, "epoch": 0.2612878013587447, "flos": 71862101389440.0, "grad_norm": 0.9765485020574719, "language_loss": 0.58150864, "learning_rate": 3.464765832231694e-06, "loss": 0.60227311, "num_input_tokens_seen": 46510050, "step": 2173, "time_per_iteration": 3.121633529663086 }, { "auxiliary_loss_clip": 0.01191774, "auxiliary_loss_mlp": 0.01036924, "balance_loss_clip": 1.06181264, "balance_loss_mlp": 1.02800119, "epoch": 0.26140804424938374, "flos": 20227511445120.0, "grad_norm": 3.7741997148089825, "language_loss": 0.70536321, "learning_rate": 3.4642353248089373e-06, "loss": 0.72765017, "num_input_tokens_seen": 46528810, "step": 2174, "time_per_iteration": 2.508643388748169 }, { "auxiliary_loss_clip": 0.01166922, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.05439675, "balance_loss_mlp": 1.02449787, "epoch": 0.26152828714002285, "flos": 25557085872000.0, "grad_norm": 2.240772286644814, "language_loss": 0.80300713, "learning_rate": 3.463704595262846e-06, "loss": 0.82501596, "num_input_tokens_seen": 46549690, "step": 2175, "time_per_iteration": 2.6237759590148926 }, { "auxiliary_loss_clip": 0.01156064, "auxiliary_loss_mlp": 0.01032682, "balance_loss_clip": 1.05527639, "balance_loss_mlp": 1.02354431, "epoch": 0.26164853003066196, "flos": 25446516831360.0, "grad_norm": 2.201421375841158, "language_loss": 0.70782, "learning_rate": 3.463173643673931e-06, "loss": 0.72970742, "num_input_tokens_seen": 46572215, "step": 2176, "time_per_iteration": 3.418454170227051 }, { "auxiliary_loss_clip": 0.010813, "auxiliary_loss_mlp": 0.01005249, "balance_loss_clip": 1.01859689, "balance_loss_mlp": 1.00312757, "epoch": 0.261768772921301, "flos": 53944580568960.0, "grad_norm": 0.8965113123203803, "language_loss": 0.63509458, "learning_rate": 3.4626424701227387e-06, "loss": 0.65596008, "num_input_tokens_seen": 46627275, "step": 2177, "time_per_iteration": 3.707408905029297 }, { "auxiliary_loss_clip": 0.01090076, "auxiliary_loss_mlp": 0.01004961, "balance_loss_clip": 1.01870632, "balance_loss_mlp": 1.00292289, "epoch": 0.26188901581194013, "flos": 70687606481280.0, "grad_norm": 0.8167687227736299, "language_loss": 0.55761981, "learning_rate": 3.4621110746898452e-06, "loss": 0.57857019, "num_input_tokens_seen": 46695135, "step": 2178, "time_per_iteration": 3.215350389480591 }, { "auxiliary_loss_clip": 0.0119003, "auxiliary_loss_mlp": 0.01034766, "balance_loss_clip": 1.05866778, "balance_loss_mlp": 1.02628374, "epoch": 0.2620092587025792, "flos": 21069580959360.0, "grad_norm": 1.80968952913865, "language_loss": 0.74861968, "learning_rate": 3.4615794574558654e-06, "loss": 0.77086765, "num_input_tokens_seen": 46714145, "step": 2179, "time_per_iteration": 3.482170820236206 }, { "auxiliary_loss_clip": 0.01174945, "auxiliary_loss_mlp": 0.01033176, "balance_loss_clip": 1.05689931, "balance_loss_mlp": 1.02428293, "epoch": 0.2621295015932183, "flos": 18369601395840.0, "grad_norm": 2.3522861750394815, "language_loss": 0.84010243, "learning_rate": 3.4610476185014436e-06, "loss": 0.86218363, "num_input_tokens_seen": 46731405, "step": 2180, "time_per_iteration": 2.518636703491211 }, { "auxiliary_loss_clip": 0.01204751, "auxiliary_loss_mlp": 0.01042658, "balance_loss_clip": 1.06116176, "balance_loss_mlp": 1.03303826, "epoch": 0.2622497444838574, "flos": 23659997063040.0, "grad_norm": 1.8094181141048817, "language_loss": 0.79075956, "learning_rate": 3.4605155579072597e-06, "loss": 0.81323361, "num_input_tokens_seen": 46751260, "step": 2181, "time_per_iteration": 2.5096688270568848 }, { "auxiliary_loss_clip": 0.01134629, "auxiliary_loss_mlp": 0.01033761, "balance_loss_clip": 1.04964185, "balance_loss_mlp": 1.02476668, "epoch": 0.26236998737449646, "flos": 22123810154880.0, "grad_norm": 1.8425704283932451, "language_loss": 0.70879638, "learning_rate": 3.459983275754027e-06, "loss": 0.73048031, "num_input_tokens_seen": 46770155, "step": 2182, "time_per_iteration": 2.6323044300079346 }, { "auxiliary_loss_clip": 0.0120241, "auxiliary_loss_mlp": 0.0103667, "balance_loss_clip": 1.05937922, "balance_loss_mlp": 1.02758002, "epoch": 0.26249023026513557, "flos": 17895185539200.0, "grad_norm": 2.1592073188038134, "language_loss": 0.79753935, "learning_rate": 3.4594507721224918e-06, "loss": 0.8199302, "num_input_tokens_seen": 46788805, "step": 2183, "time_per_iteration": 2.485175609588623 }, { "auxiliary_loss_clip": 0.01175287, "auxiliary_loss_mlp": 0.01036619, "balance_loss_clip": 1.05538905, "balance_loss_mlp": 1.02816164, "epoch": 0.2626104731557747, "flos": 18332936588160.0, "grad_norm": 1.6267557995845998, "language_loss": 0.81657732, "learning_rate": 3.4589180470934353e-06, "loss": 0.83869636, "num_input_tokens_seen": 46808670, "step": 2184, "time_per_iteration": 2.5769431591033936 }, { "auxiliary_loss_clip": 0.01192703, "auxiliary_loss_mlp": 0.01032175, "balance_loss_clip": 1.05643451, "balance_loss_mlp": 1.02199447, "epoch": 0.26273071604641374, "flos": 19317714837120.0, "grad_norm": 1.778481225118753, "language_loss": 0.76767683, "learning_rate": 3.4583851007476713e-06, "loss": 0.78992563, "num_input_tokens_seen": 46827140, "step": 2185, "time_per_iteration": 2.503451108932495 }, { "auxiliary_loss_clip": 0.01161679, "auxiliary_loss_mlp": 0.01040064, "balance_loss_clip": 1.05686283, "balance_loss_mlp": 1.03090835, "epoch": 0.26285095893705285, "flos": 18327477720960.0, "grad_norm": 2.2549744231999247, "language_loss": 0.68717819, "learning_rate": 3.4578519331660464e-06, "loss": 0.70919561, "num_input_tokens_seen": 46844135, "step": 2186, "time_per_iteration": 2.556922674179077 }, { "auxiliary_loss_clip": 0.01184793, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.06004071, "balance_loss_mlp": 1.02288401, "epoch": 0.26297120182769196, "flos": 20193827466240.0, "grad_norm": 2.1806014955942987, "language_loss": 0.81868315, "learning_rate": 3.4573185444294426e-06, "loss": 0.84085023, "num_input_tokens_seen": 46862500, "step": 2187, "time_per_iteration": 2.482633113861084 }, { "auxiliary_loss_clip": 0.01175543, "auxiliary_loss_mlp": 0.00762761, "balance_loss_clip": 1.05780888, "balance_loss_mlp": 1.00025761, "epoch": 0.263091444718331, "flos": 22418421505920.0, "grad_norm": 1.7147972737176664, "language_loss": 0.78867865, "learning_rate": 3.456784934618774e-06, "loss": 0.80806172, "num_input_tokens_seen": 46883665, "step": 2188, "time_per_iteration": 2.550771713256836 }, { "auxiliary_loss_clip": 0.01168735, "auxiliary_loss_mlp": 0.01034199, "balance_loss_clip": 1.05375648, "balance_loss_mlp": 1.02525806, "epoch": 0.2632116876089701, "flos": 19024827338880.0, "grad_norm": 2.1884027256068688, "language_loss": 0.80210429, "learning_rate": 3.4562511038149897e-06, "loss": 0.82413363, "num_input_tokens_seen": 46899160, "step": 2189, "time_per_iteration": 2.5027947425842285 }, { "auxiliary_loss_clip": 0.01042226, "auxiliary_loss_mlp": 0.01009366, "balance_loss_clip": 1.01682734, "balance_loss_mlp": 1.0073632, "epoch": 0.26333193049960923, "flos": 67308054531840.0, "grad_norm": 0.9765564595558546, "language_loss": 0.57744515, "learning_rate": 3.4557170520990705e-06, "loss": 0.59796107, "num_input_tokens_seen": 46959835, "step": 2190, "time_per_iteration": 3.217667818069458 }, { "auxiliary_loss_clip": 0.01184914, "auxiliary_loss_mlp": 0.01033823, "balance_loss_clip": 1.05888677, "balance_loss_mlp": 1.02524626, "epoch": 0.2634521733902483, "flos": 25048806468480.0, "grad_norm": 1.4555542264737897, "language_loss": 0.8652705, "learning_rate": 3.4551827795520324e-06, "loss": 0.88745791, "num_input_tokens_seen": 46982720, "step": 2191, "time_per_iteration": 2.5665106773376465 }, { "auxiliary_loss_clip": 0.01185921, "auxiliary_loss_mlp": 0.01035909, "balance_loss_clip": 1.055668, "balance_loss_mlp": 1.02704597, "epoch": 0.2635724162808874, "flos": 20594985534720.0, "grad_norm": 1.759495401317626, "language_loss": 0.84780812, "learning_rate": 3.4546482862549226e-06, "loss": 0.87002647, "num_input_tokens_seen": 47003035, "step": 2192, "time_per_iteration": 2.511434555053711 }, { "auxiliary_loss_clip": 0.0115198, "auxiliary_loss_mlp": 0.01030346, "balance_loss_clip": 1.05241954, "balance_loss_mlp": 1.02182293, "epoch": 0.2636926591715265, "flos": 19244636616960.0, "grad_norm": 2.1794616290049347, "language_loss": 0.78806412, "learning_rate": 3.4541135722888253e-06, "loss": 0.80988735, "num_input_tokens_seen": 47019625, "step": 2193, "time_per_iteration": 2.5696299076080322 }, { "auxiliary_loss_clip": 0.01200368, "auxiliary_loss_mlp": 0.01029409, "balance_loss_clip": 1.05842614, "balance_loss_mlp": 1.02102304, "epoch": 0.26381290206216557, "flos": 28804882734720.0, "grad_norm": 1.6539957983333218, "language_loss": 0.80064142, "learning_rate": 3.453578637734854e-06, "loss": 0.82293916, "num_input_tokens_seen": 47040815, "step": 2194, "time_per_iteration": 2.5336151123046875 }, { "auxiliary_loss_clip": 0.01204618, "auxiliary_loss_mlp": 0.01037707, "balance_loss_clip": 1.06360078, "balance_loss_mlp": 1.02904081, "epoch": 0.2639331449528047, "flos": 25008909436800.0, "grad_norm": 1.7393670323153714, "language_loss": 0.78529447, "learning_rate": 3.4530434826741605e-06, "loss": 0.80771774, "num_input_tokens_seen": 47061755, "step": 2195, "time_per_iteration": 2.51389217376709 }, { "auxiliary_loss_clip": 0.01168917, "auxiliary_loss_mlp": 0.01031761, "balance_loss_clip": 1.05755568, "balance_loss_mlp": 1.02373266, "epoch": 0.26405338784344373, "flos": 46535775465600.0, "grad_norm": 1.61060983508782, "language_loss": 0.68563664, "learning_rate": 3.452508107187926e-06, "loss": 0.70764345, "num_input_tokens_seen": 47085130, "step": 2196, "time_per_iteration": 3.461779832839966 }, { "auxiliary_loss_clip": 0.01127693, "auxiliary_loss_mlp": 0.01037115, "balance_loss_clip": 1.04775918, "balance_loss_mlp": 1.02770305, "epoch": 0.26417363073408284, "flos": 21179467641600.0, "grad_norm": 1.9060793781494343, "language_loss": 0.7721808, "learning_rate": 3.451972511357366e-06, "loss": 0.79382885, "num_input_tokens_seen": 47104675, "step": 2197, "time_per_iteration": 2.637789487838745 }, { "auxiliary_loss_clip": 0.01181461, "auxiliary_loss_mlp": 0.01031407, "balance_loss_clip": 1.05806315, "balance_loss_mlp": 1.0226872, "epoch": 0.26429387362472195, "flos": 22674751937280.0, "grad_norm": 1.7579904093483376, "language_loss": 0.85398436, "learning_rate": 3.45143669526373e-06, "loss": 0.87611306, "num_input_tokens_seen": 47124435, "step": 2198, "time_per_iteration": 2.522179126739502 }, { "auxiliary_loss_clip": 0.0106985, "auxiliary_loss_mlp": 0.01002121, "balance_loss_clip": 1.01727867, "balance_loss_mlp": 1.00005913, "epoch": 0.264414116515361, "flos": 67180534272000.0, "grad_norm": 0.8164346032526584, "language_loss": 0.63189256, "learning_rate": 3.450900658988302e-06, "loss": 0.65261221, "num_input_tokens_seen": 47185985, "step": 2199, "time_per_iteration": 3.071333885192871 }, { "auxiliary_loss_clip": 0.01163374, "auxiliary_loss_mlp": 0.01035219, "balance_loss_clip": 1.05589855, "balance_loss_mlp": 1.02564692, "epoch": 0.2645343594060001, "flos": 25664709997440.0, "grad_norm": 1.9279970129618336, "language_loss": 0.77786541, "learning_rate": 3.450364402612397e-06, "loss": 0.79985136, "num_input_tokens_seen": 47203140, "step": 2200, "time_per_iteration": 2.57719349861145 }, { "auxiliary_loss_clip": 0.011683, "auxiliary_loss_mlp": 0.01038853, "balance_loss_clip": 1.05233419, "balance_loss_mlp": 1.03029335, "epoch": 0.26465460229663923, "flos": 22491822948480.0, "grad_norm": 1.875680228160289, "language_loss": 0.83933878, "learning_rate": 3.449827926217366e-06, "loss": 0.86141026, "num_input_tokens_seen": 47222575, "step": 2201, "time_per_iteration": 2.551110029220581 }, { "auxiliary_loss_clip": 0.0117148, "auxiliary_loss_mlp": 0.01032795, "balance_loss_clip": 1.05066216, "balance_loss_mlp": 1.02430153, "epoch": 0.2647748451872783, "flos": 29388036038400.0, "grad_norm": 3.563511531904457, "language_loss": 0.8042897, "learning_rate": 3.449291229884591e-06, "loss": 0.82633245, "num_input_tokens_seen": 47243815, "step": 2202, "time_per_iteration": 3.3992879390716553 }, { "auxiliary_loss_clip": 0.01159255, "auxiliary_loss_mlp": 0.01035775, "balance_loss_clip": 1.05328548, "balance_loss_mlp": 1.02681684, "epoch": 0.2648950880779174, "flos": 26797799502720.0, "grad_norm": 1.7124917921102993, "language_loss": 0.86763954, "learning_rate": 3.4487543136954887e-06, "loss": 0.88958985, "num_input_tokens_seen": 47263435, "step": 2203, "time_per_iteration": 3.268906354904175 }, { "auxiliary_loss_clip": 0.01155705, "auxiliary_loss_mlp": 0.01033708, "balance_loss_clip": 1.0537703, "balance_loss_mlp": 1.02491069, "epoch": 0.2650153309685565, "flos": 28841008838400.0, "grad_norm": 1.7005070816022452, "language_loss": 0.91251469, "learning_rate": 3.448217177731509e-06, "loss": 0.93440878, "num_input_tokens_seen": 47283920, "step": 2204, "time_per_iteration": 2.61950945854187 }, { "auxiliary_loss_clip": 0.01166272, "auxiliary_loss_mlp": 0.0103842, "balance_loss_clip": 1.05586767, "balance_loss_mlp": 1.02946115, "epoch": 0.26513557385919556, "flos": 20303247271680.0, "grad_norm": 2.7780153602186917, "language_loss": 0.77702719, "learning_rate": 3.4476798220741348e-06, "loss": 0.79907411, "num_input_tokens_seen": 47302800, "step": 2205, "time_per_iteration": 3.3384013175964355 }, { "auxiliary_loss_clip": 0.01202609, "auxiliary_loss_mlp": 0.01033691, "balance_loss_clip": 1.06305718, "balance_loss_mlp": 1.02554309, "epoch": 0.26525581674983467, "flos": 17676274101120.0, "grad_norm": 1.7720048109130149, "language_loss": 0.78223443, "learning_rate": 3.4471422468048826e-06, "loss": 0.8045975, "num_input_tokens_seen": 47321525, "step": 2206, "time_per_iteration": 2.451143264770508 }, { "auxiliary_loss_clip": 0.01177753, "auxiliary_loss_mlp": 0.01034461, "balance_loss_clip": 1.05757999, "balance_loss_mlp": 1.02479911, "epoch": 0.2653760596404738, "flos": 26833746038400.0, "grad_norm": 2.5366575595775043, "language_loss": 0.73327482, "learning_rate": 3.4466044520053022e-06, "loss": 0.75539696, "num_input_tokens_seen": 47340530, "step": 2207, "time_per_iteration": 2.539808511734009 }, { "auxiliary_loss_clip": 0.01156834, "auxiliary_loss_mlp": 0.01037486, "balance_loss_clip": 1.05093551, "balance_loss_mlp": 1.02886701, "epoch": 0.26549630253111284, "flos": 22782160581120.0, "grad_norm": 1.7545379625084028, "language_loss": 0.60538185, "learning_rate": 3.446066437756977e-06, "loss": 0.627325, "num_input_tokens_seen": 47359735, "step": 2208, "time_per_iteration": 2.5168755054473877 }, { "auxiliary_loss_clip": 0.01170691, "auxiliary_loss_mlp": 0.01027802, "balance_loss_clip": 1.05587602, "balance_loss_mlp": 1.01873577, "epoch": 0.26561654542175195, "flos": 23550002640000.0, "grad_norm": 2.279722453515776, "language_loss": 0.75347096, "learning_rate": 3.4455282041415224e-06, "loss": 0.77545589, "num_input_tokens_seen": 47378945, "step": 2209, "time_per_iteration": 2.5534069538116455 }, { "auxiliary_loss_clip": 0.01155693, "auxiliary_loss_mlp": 0.01034549, "balance_loss_clip": 1.05290401, "balance_loss_mlp": 1.02581668, "epoch": 0.265736788312391, "flos": 26906680604160.0, "grad_norm": 2.0280871493829205, "language_loss": 0.87160403, "learning_rate": 3.4449897512405894e-06, "loss": 0.89350647, "num_input_tokens_seen": 47398095, "step": 2210, "time_per_iteration": 2.598217725753784 }, { "auxiliary_loss_clip": 0.01120118, "auxiliary_loss_mlp": 0.00762592, "balance_loss_clip": 1.04640937, "balance_loss_mlp": 1.00027514, "epoch": 0.2658570312030301, "flos": 23477139901440.0, "grad_norm": 2.004827955253511, "language_loss": 0.75474727, "learning_rate": 3.444451079135859e-06, "loss": 0.77357441, "num_input_tokens_seen": 47417605, "step": 2211, "time_per_iteration": 2.6826331615448 }, { "auxiliary_loss_clip": 0.01134073, "auxiliary_loss_mlp": 0.00762596, "balance_loss_clip": 1.05016208, "balance_loss_mlp": 1.00026143, "epoch": 0.2659772740936692, "flos": 21866402315520.0, "grad_norm": 1.8568523219838553, "language_loss": 0.74154055, "learning_rate": 3.4439121879090493e-06, "loss": 0.76050729, "num_input_tokens_seen": 47435385, "step": 2212, "time_per_iteration": 2.6048130989074707 }, { "auxiliary_loss_clip": 0.01175681, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.05619442, "balance_loss_mlp": 1.02624631, "epoch": 0.2660975169843083, "flos": 19793100360960.0, "grad_norm": 2.072414500629902, "language_loss": 0.83193982, "learning_rate": 3.4433730776419082e-06, "loss": 0.85404718, "num_input_tokens_seen": 47454310, "step": 2213, "time_per_iteration": 2.559068202972412 }, { "auxiliary_loss_clip": 0.01188343, "auxiliary_loss_mlp": 0.00763351, "balance_loss_clip": 1.05639458, "balance_loss_mlp": 1.00027752, "epoch": 0.2662177598749474, "flos": 29018981750400.0, "grad_norm": 2.1545023116287756, "language_loss": 0.79959881, "learning_rate": 3.4428337484162183e-06, "loss": 0.81911576, "num_input_tokens_seen": 47475120, "step": 2214, "time_per_iteration": 2.5759618282318115 }, { "auxiliary_loss_clip": 0.01165226, "auxiliary_loss_mlp": 0.01036215, "balance_loss_clip": 1.05147755, "balance_loss_mlp": 1.02721488, "epoch": 0.2663380027655865, "flos": 21762549118080.0, "grad_norm": 1.726396583592112, "language_loss": 0.84172571, "learning_rate": 3.442294200313797e-06, "loss": 0.86374009, "num_input_tokens_seen": 47493150, "step": 2215, "time_per_iteration": 2.5511763095855713 }, { "auxiliary_loss_clip": 0.0109063, "auxiliary_loss_mlp": 0.0100209, "balance_loss_clip": 1.0204258, "balance_loss_mlp": 1.0001353, "epoch": 0.26645824565622556, "flos": 66980333819520.0, "grad_norm": 0.7662825368767422, "language_loss": 0.52735877, "learning_rate": 3.4417544334164916e-06, "loss": 0.54828596, "num_input_tokens_seen": 47557295, "step": 2216, "time_per_iteration": 3.1320981979370117 }, { "auxiliary_loss_clip": 0.01155642, "auxiliary_loss_mlp": 0.0103172, "balance_loss_clip": 1.05660963, "balance_loss_mlp": 1.02309561, "epoch": 0.26657848854686467, "flos": 25264198373760.0, "grad_norm": 1.7488988716850495, "language_loss": 0.77387643, "learning_rate": 3.4412144478061854e-06, "loss": 0.79575014, "num_input_tokens_seen": 47579705, "step": 2217, "time_per_iteration": 2.5900399684906006 }, { "auxiliary_loss_clip": 0.01097053, "auxiliary_loss_mlp": 0.01037391, "balance_loss_clip": 1.04836535, "balance_loss_mlp": 1.02795517, "epoch": 0.2666987314375038, "flos": 23696769611520.0, "grad_norm": 1.846033361114347, "language_loss": 0.75477493, "learning_rate": 3.4406742435647925e-06, "loss": 0.77611935, "num_input_tokens_seen": 47599770, "step": 2218, "time_per_iteration": 2.7240214347839355 }, { "auxiliary_loss_clip": 0.01185377, "auxiliary_loss_mlp": 0.01030184, "balance_loss_clip": 1.0600822, "balance_loss_mlp": 1.02222633, "epoch": 0.26681897432814283, "flos": 27048958375680.0, "grad_norm": 2.095684102872311, "language_loss": 0.79295325, "learning_rate": 3.440133820774263e-06, "loss": 0.8151089, "num_input_tokens_seen": 47619580, "step": 2219, "time_per_iteration": 2.5479087829589844 }, { "auxiliary_loss_clip": 0.01176551, "auxiliary_loss_mlp": 0.01037219, "balance_loss_clip": 1.05910158, "balance_loss_mlp": 1.02829003, "epoch": 0.26693921721878194, "flos": 28985944216320.0, "grad_norm": 3.523812715907441, "language_loss": 0.81593776, "learning_rate": 3.439593179516578e-06, "loss": 0.83807552, "num_input_tokens_seen": 47639490, "step": 2220, "time_per_iteration": 2.6019537448883057 }, { "auxiliary_loss_clip": 0.01176928, "auxiliary_loss_mlp": 0.01035626, "balance_loss_clip": 1.05759001, "balance_loss_mlp": 1.0264585, "epoch": 0.26705946010942105, "flos": 21507834798720.0, "grad_norm": 2.1459535063176416, "language_loss": 0.81082463, "learning_rate": 3.4390523198737524e-06, "loss": 0.83295017, "num_input_tokens_seen": 47658650, "step": 2221, "time_per_iteration": 2.5908679962158203 }, { "auxiliary_loss_clip": 0.01202081, "auxiliary_loss_mlp": 0.00762496, "balance_loss_clip": 1.05988002, "balance_loss_mlp": 1.00026941, "epoch": 0.2671797030000601, "flos": 21471277731840.0, "grad_norm": 2.7338498093676127, "language_loss": 0.73497987, "learning_rate": 3.4385112419278333e-06, "loss": 0.75462568, "num_input_tokens_seen": 47679875, "step": 2222, "time_per_iteration": 3.2499656677246094 }, { "auxiliary_loss_clip": 0.01080318, "auxiliary_loss_mlp": 0.01004256, "balance_loss_clip": 1.01966047, "balance_loss_mlp": 1.00222993, "epoch": 0.2672999458906992, "flos": 64189929767040.0, "grad_norm": 0.7942183606681902, "language_loss": 0.64862704, "learning_rate": 3.4379699457609033e-06, "loss": 0.66947275, "num_input_tokens_seen": 47737700, "step": 2223, "time_per_iteration": 2.9545304775238037 }, { "auxiliary_loss_clip": 0.01162159, "auxiliary_loss_mlp": 0.0103281, "balance_loss_clip": 1.05257583, "balance_loss_mlp": 1.02338028, "epoch": 0.26742018878133833, "flos": 16909042573440.0, "grad_norm": 1.7933627484966441, "language_loss": 0.90281963, "learning_rate": 3.4374284314550755e-06, "loss": 0.92476928, "num_input_tokens_seen": 47756740, "step": 2224, "time_per_iteration": 2.539372444152832 }, { "auxiliary_loss_clip": 0.01201072, "auxiliary_loss_mlp": 0.01036606, "balance_loss_clip": 1.06068206, "balance_loss_mlp": 1.02777791, "epoch": 0.2675404316719774, "flos": 20667560964480.0, "grad_norm": 1.8478705746315853, "language_loss": 0.81301486, "learning_rate": 3.436886699092498e-06, "loss": 0.83539164, "num_input_tokens_seen": 47775255, "step": 2225, "time_per_iteration": 2.4700374603271484 }, { "auxiliary_loss_clip": 0.01204834, "auxiliary_loss_mlp": 0.01041698, "balance_loss_clip": 1.06108308, "balance_loss_mlp": 1.03252482, "epoch": 0.2676606745626165, "flos": 17485013157120.0, "grad_norm": 7.787948792245968, "language_loss": 0.7133162, "learning_rate": 3.4363447487553502e-06, "loss": 0.73578155, "num_input_tokens_seen": 47788570, "step": 2226, "time_per_iteration": 2.432028293609619 }, { "auxiliary_loss_clip": 0.01168204, "auxiliary_loss_mlp": 0.0104118, "balance_loss_clip": 1.05713797, "balance_loss_mlp": 1.0320133, "epoch": 0.26778091745325555, "flos": 27852675143040.0, "grad_norm": 1.706422194811565, "language_loss": 0.78235567, "learning_rate": 3.4358025805258455e-06, "loss": 0.8044495, "num_input_tokens_seen": 47808275, "step": 2227, "time_per_iteration": 2.576878070831299 }, { "auxiliary_loss_clip": 0.01146194, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.05208302, "balance_loss_mlp": 1.02156126, "epoch": 0.26790116034389466, "flos": 20955995176320.0, "grad_norm": 2.512918619935285, "language_loss": 0.83910155, "learning_rate": 3.435260194486232e-06, "loss": 0.86086828, "num_input_tokens_seen": 47826245, "step": 2228, "time_per_iteration": 3.3695085048675537 }, { "auxiliary_loss_clip": 0.01171737, "auxiliary_loss_mlp": 0.01031217, "balance_loss_clip": 1.05812919, "balance_loss_mlp": 1.02218127, "epoch": 0.2680214032345338, "flos": 18040659621120.0, "grad_norm": 2.103098614691949, "language_loss": 0.82344294, "learning_rate": 3.4347175907187875e-06, "loss": 0.84547246, "num_input_tokens_seen": 47843235, "step": 2229, "time_per_iteration": 3.245657205581665 }, { "auxiliary_loss_clip": 0.01185765, "auxiliary_loss_mlp": 0.01036729, "balance_loss_clip": 1.05919456, "balance_loss_mlp": 1.02764511, "epoch": 0.26814164612517283, "flos": 22419427086720.0, "grad_norm": 1.9127866493973917, "language_loss": 0.87940794, "learning_rate": 3.4341747693058254e-06, "loss": 0.90163291, "num_input_tokens_seen": 47861710, "step": 2230, "time_per_iteration": 3.347287178039551 }, { "auxiliary_loss_clip": 0.01090477, "auxiliary_loss_mlp": 0.01035548, "balance_loss_clip": 1.04763794, "balance_loss_mlp": 1.02666128, "epoch": 0.26826188901581194, "flos": 35627371159680.0, "grad_norm": 1.9519237522537427, "language_loss": 0.7702868, "learning_rate": 3.4336317303296916e-06, "loss": 0.79154706, "num_input_tokens_seen": 47882685, "step": 2231, "time_per_iteration": 2.8360073566436768 }, { "auxiliary_loss_clip": 0.01182477, "auxiliary_loss_mlp": 0.01033374, "balance_loss_clip": 1.05707824, "balance_loss_mlp": 1.02437949, "epoch": 0.26838213190645105, "flos": 17639788861440.0, "grad_norm": 2.674670987534941, "language_loss": 0.75453848, "learning_rate": 3.4330884738727635e-06, "loss": 0.77669698, "num_input_tokens_seen": 47900860, "step": 2232, "time_per_iteration": 2.686034679412842 }, { "auxiliary_loss_clip": 0.01134557, "auxiliary_loss_mlp": 0.01035486, "balance_loss_clip": 1.05386961, "balance_loss_mlp": 1.0264914, "epoch": 0.2685023747970901, "flos": 22674823764480.0, "grad_norm": 1.8276510882690735, "language_loss": 0.70535713, "learning_rate": 3.4325450000174535e-06, "loss": 0.72705758, "num_input_tokens_seen": 47917500, "step": 2233, "time_per_iteration": 2.592045783996582 }, { "auxiliary_loss_clip": 0.01133252, "auxiliary_loss_mlp": 0.01035786, "balance_loss_clip": 1.05029309, "balance_loss_mlp": 1.02656555, "epoch": 0.2686226176877292, "flos": 20120533764480.0, "grad_norm": 1.6634343017392166, "language_loss": 0.73888707, "learning_rate": 3.4320013088462067e-06, "loss": 0.76057744, "num_input_tokens_seen": 47934860, "step": 2234, "time_per_iteration": 2.592363119125366 }, { "auxiliary_loss_clip": 0.01158895, "auxiliary_loss_mlp": 0.01034725, "balance_loss_clip": 1.05302835, "balance_loss_mlp": 1.02574217, "epoch": 0.2687428605783683, "flos": 21872040750720.0, "grad_norm": 1.436555645751939, "language_loss": 0.81479031, "learning_rate": 3.431457400441499e-06, "loss": 0.83672649, "num_input_tokens_seen": 47955255, "step": 2235, "time_per_iteration": 2.5742905139923096 }, { "auxiliary_loss_clip": 0.01020247, "auxiliary_loss_mlp": 0.01002051, "balance_loss_clip": 1.01335192, "balance_loss_mlp": 0.99990541, "epoch": 0.2688631034690074, "flos": 69943320766080.0, "grad_norm": 0.9227296692943235, "language_loss": 0.6093812, "learning_rate": 3.4309132748858424e-06, "loss": 0.62960422, "num_input_tokens_seen": 48016245, "step": 2236, "time_per_iteration": 3.2282798290252686 }, { "auxiliary_loss_clip": 0.0118394, "auxiliary_loss_mlp": 0.01034962, "balance_loss_clip": 1.05926514, "balance_loss_mlp": 1.02645683, "epoch": 0.2689833463596465, "flos": 22856639431680.0, "grad_norm": 1.6546448491307526, "language_loss": 0.83487093, "learning_rate": 3.430368932261779e-06, "loss": 0.85705996, "num_input_tokens_seen": 48036600, "step": 2237, "time_per_iteration": 2.540611743927002 }, { "auxiliary_loss_clip": 0.01170503, "auxiliary_loss_mlp": 0.01033786, "balance_loss_clip": 1.05815756, "balance_loss_mlp": 1.02480936, "epoch": 0.2691035892502856, "flos": 17200242132480.0, "grad_norm": 1.8312946505086265, "language_loss": 0.75157845, "learning_rate": 3.429824372651886e-06, "loss": 0.77362132, "num_input_tokens_seen": 48054750, "step": 2238, "time_per_iteration": 2.505087375640869 }, { "auxiliary_loss_clip": 0.01152107, "auxiliary_loss_mlp": 0.0102688, "balance_loss_clip": 1.05426788, "balance_loss_mlp": 1.01778483, "epoch": 0.26922383214092466, "flos": 17747484814080.0, "grad_norm": 2.878217044745344, "language_loss": 0.83077705, "learning_rate": 3.4292795961387732e-06, "loss": 0.8525669, "num_input_tokens_seen": 48072650, "step": 2239, "time_per_iteration": 2.6017048358917236 }, { "auxiliary_loss_clip": 0.01198014, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.05739641, "balance_loss_mlp": 1.0250268, "epoch": 0.26934407503156377, "flos": 16173376122240.0, "grad_norm": 2.3134215468665142, "language_loss": 0.87339002, "learning_rate": 3.4287346028050818e-06, "loss": 0.8957094, "num_input_tokens_seen": 48088720, "step": 2240, "time_per_iteration": 2.4366824626922607 }, { "auxiliary_loss_clip": 0.01172902, "auxiliary_loss_mlp": 0.01036305, "balance_loss_clip": 1.05716515, "balance_loss_mlp": 1.02790058, "epoch": 0.2694643179222028, "flos": 23732895715200.0, "grad_norm": 1.4729490659547153, "language_loss": 0.79646218, "learning_rate": 3.4281893927334866e-06, "loss": 0.81855428, "num_input_tokens_seen": 48108630, "step": 2241, "time_per_iteration": 2.5557057857513428 }, { "auxiliary_loss_clip": 0.01187268, "auxiliary_loss_mlp": 0.0103733, "balance_loss_clip": 1.05938363, "balance_loss_mlp": 1.02850246, "epoch": 0.26958456081284193, "flos": 24718140840960.0, "grad_norm": 1.9532074946406646, "language_loss": 0.75155002, "learning_rate": 3.4276439660066963e-06, "loss": 0.77379596, "num_input_tokens_seen": 48128330, "step": 2242, "time_per_iteration": 2.5369086265563965 }, { "auxiliary_loss_clip": 0.01196002, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.05810678, "balance_loss_mlp": 1.02357364, "epoch": 0.26970480370348104, "flos": 18112588606080.0, "grad_norm": 2.6956873230163003, "language_loss": 0.84165609, "learning_rate": 3.427098322707452e-06, "loss": 0.86393923, "num_input_tokens_seen": 48144295, "step": 2243, "time_per_iteration": 2.4258689880371094 }, { "auxiliary_loss_clip": 0.01184912, "auxiliary_loss_mlp": 0.01038653, "balance_loss_clip": 1.06041396, "balance_loss_mlp": 1.02965844, "epoch": 0.2698250465941201, "flos": 10816546250880.0, "grad_norm": 1.9891821945076495, "language_loss": 0.89422244, "learning_rate": 3.426552462918526e-06, "loss": 0.91645813, "num_input_tokens_seen": 48162230, "step": 2244, "time_per_iteration": 2.467433214187622 }, { "auxiliary_loss_clip": 0.01202326, "auxiliary_loss_mlp": 0.01038708, "balance_loss_clip": 1.06314254, "balance_loss_mlp": 1.02991068, "epoch": 0.2699452894847592, "flos": 17308117653120.0, "grad_norm": 2.320289609288705, "language_loss": 0.73065478, "learning_rate": 3.426006386722726e-06, "loss": 0.75306511, "num_input_tokens_seen": 48180290, "step": 2245, "time_per_iteration": 2.4437308311462402 }, { "auxiliary_loss_clip": 0.01159172, "auxiliary_loss_mlp": 0.01031721, "balance_loss_clip": 1.0578295, "balance_loss_mlp": 1.02324522, "epoch": 0.2700655323753983, "flos": 18078150441600.0, "grad_norm": 1.9242404232551598, "language_loss": 0.92233354, "learning_rate": 3.4254600942028914e-06, "loss": 0.94424254, "num_input_tokens_seen": 48198165, "step": 2246, "time_per_iteration": 2.563342809677124 }, { "auxiliary_loss_clip": 0.01166524, "auxiliary_loss_mlp": 0.01028229, "balance_loss_clip": 1.05758905, "balance_loss_mlp": 1.0200932, "epoch": 0.2701857752660374, "flos": 18186636493440.0, "grad_norm": 2.1349041809808127, "language_loss": 0.82637435, "learning_rate": 3.424913585441893e-06, "loss": 0.84832191, "num_input_tokens_seen": 48216000, "step": 2247, "time_per_iteration": 2.49953293800354 }, { "auxiliary_loss_clip": 0.01181778, "auxiliary_loss_mlp": 0.01039228, "balance_loss_clip": 1.05804133, "balance_loss_mlp": 1.0301739, "epoch": 0.2703060181566765, "flos": 16319496648960.0, "grad_norm": 1.8711787808860212, "language_loss": 0.87162197, "learning_rate": 3.4243668605226374e-06, "loss": 0.89383203, "num_input_tokens_seen": 48233025, "step": 2248, "time_per_iteration": 3.2924299240112305 }, { "auxiliary_loss_clip": 0.01157603, "auxiliary_loss_mlp": 0.01035154, "balance_loss_clip": 1.05695117, "balance_loss_mlp": 1.02496755, "epoch": 0.2704262610473156, "flos": 19572357329280.0, "grad_norm": 2.252389206973943, "language_loss": 0.82687432, "learning_rate": 3.423819919528061e-06, "loss": 0.84880185, "num_input_tokens_seen": 48251110, "step": 2249, "time_per_iteration": 2.545948028564453 }, { "auxiliary_loss_clip": 0.01145636, "auxiliary_loss_mlp": 0.01028299, "balance_loss_clip": 1.05055404, "balance_loss_mlp": 1.01962662, "epoch": 0.27054650393795465, "flos": 20740746925440.0, "grad_norm": 1.7786542082656838, "language_loss": 0.77917504, "learning_rate": 3.4232727625411355e-06, "loss": 0.80091441, "num_input_tokens_seen": 48270215, "step": 2250, "time_per_iteration": 2.62650203704834 }, { "auxiliary_loss_clip": 0.01122464, "auxiliary_loss_mlp": 0.01033659, "balance_loss_clip": 1.05063248, "balance_loss_mlp": 1.02586234, "epoch": 0.27066674682859376, "flos": 18658322916480.0, "grad_norm": 1.6614578282037553, "language_loss": 0.86242998, "learning_rate": 3.4227253896448626e-06, "loss": 0.88399124, "num_input_tokens_seen": 48288075, "step": 2251, "time_per_iteration": 2.6431922912597656 }, { "auxiliary_loss_clip": 0.01194589, "auxiliary_loss_mlp": 0.01031475, "balance_loss_clip": 1.05645943, "balance_loss_mlp": 1.02307057, "epoch": 0.2707869897192329, "flos": 23002759958400.0, "grad_norm": 3.866183862387396, "language_loss": 0.82192552, "learning_rate": 3.42217780092228e-06, "loss": 0.84418619, "num_input_tokens_seen": 48306415, "step": 2252, "time_per_iteration": 2.540240526199341 }, { "auxiliary_loss_clip": 0.01063102, "auxiliary_loss_mlp": 0.01006106, "balance_loss_clip": 1.02039075, "balance_loss_mlp": 1.00398433, "epoch": 0.27090723260987193, "flos": 58323240293760.0, "grad_norm": 1.075973138957363, "language_loss": 0.60361457, "learning_rate": 3.421629996456456e-06, "loss": 0.62430668, "num_input_tokens_seen": 48365035, "step": 2253, "time_per_iteration": 3.0880115032196045 }, { "auxiliary_loss_clip": 0.01185656, "auxiliary_loss_mlp": 0.01028135, "balance_loss_clip": 1.05892622, "balance_loss_mlp": 1.01928425, "epoch": 0.27102747550051104, "flos": 11984540797440.0, "grad_norm": 2.510374713282056, "language_loss": 0.82581669, "learning_rate": 3.421081976330491e-06, "loss": 0.84795463, "num_input_tokens_seen": 48383550, "step": 2254, "time_per_iteration": 3.3169963359832764 }, { "auxiliary_loss_clip": 0.01167592, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.05479491, "balance_loss_mlp": 1.02164793, "epoch": 0.27114771839115015, "flos": 19900401264000.0, "grad_norm": 1.8386960874784468, "language_loss": 0.87921578, "learning_rate": 3.4205337406275207e-06, "loss": 0.90119809, "num_input_tokens_seen": 48403670, "step": 2255, "time_per_iteration": 3.5977959632873535 }, { "auxiliary_loss_clip": 0.01196911, "auxiliary_loss_mlp": 0.01029099, "balance_loss_clip": 1.05784917, "balance_loss_mlp": 1.02028942, "epoch": 0.2712679612817892, "flos": 18331966920960.0, "grad_norm": 2.4170129637831548, "language_loss": 0.75643092, "learning_rate": 3.4199852894307114e-06, "loss": 0.77869105, "num_input_tokens_seen": 48420420, "step": 2256, "time_per_iteration": 3.2933542728424072 }, { "auxiliary_loss_clip": 0.01125728, "auxiliary_loss_mlp": 0.01036609, "balance_loss_clip": 1.05161619, "balance_loss_mlp": 1.02765667, "epoch": 0.2713882041724283, "flos": 24460302038400.0, "grad_norm": 2.0643096880949163, "language_loss": 0.78901768, "learning_rate": 3.419436622823262e-06, "loss": 0.81064099, "num_input_tokens_seen": 48441140, "step": 2257, "time_per_iteration": 2.7175052165985107 }, { "auxiliary_loss_clip": 0.01168734, "auxiliary_loss_mlp": 0.0103346, "balance_loss_clip": 1.05622458, "balance_loss_mlp": 1.02514529, "epoch": 0.27150844706306737, "flos": 23039317025280.0, "grad_norm": 1.5982451878459463, "language_loss": 0.7428233, "learning_rate": 3.4188877408884063e-06, "loss": 0.76484525, "num_input_tokens_seen": 48461845, "step": 2258, "time_per_iteration": 2.575711250305176 }, { "auxiliary_loss_clip": 0.01162188, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.05368352, "balance_loss_mlp": 1.02482224, "epoch": 0.2716286899537065, "flos": 22563644192640.0, "grad_norm": 2.447584437659268, "language_loss": 0.65511882, "learning_rate": 3.4183386437094088e-06, "loss": 0.67707884, "num_input_tokens_seen": 48478510, "step": 2259, "time_per_iteration": 2.551903009414673 }, { "auxiliary_loss_clip": 0.01167976, "auxiliary_loss_mlp": 0.01031545, "balance_loss_clip": 1.05380964, "balance_loss_mlp": 1.02256894, "epoch": 0.2717489328443456, "flos": 13115044523520.0, "grad_norm": 2.355237988514992, "language_loss": 0.82321292, "learning_rate": 3.417789331369565e-06, "loss": 0.84520811, "num_input_tokens_seen": 48494300, "step": 2260, "time_per_iteration": 2.5333566665649414 }, { "auxiliary_loss_clip": 0.01203283, "auxiliary_loss_mlp": 0.01035272, "balance_loss_clip": 1.06136799, "balance_loss_mlp": 1.02594376, "epoch": 0.27186917573498465, "flos": 29278688060160.0, "grad_norm": 2.027773563450376, "language_loss": 0.91154671, "learning_rate": 3.4172398039522088e-06, "loss": 0.9339323, "num_input_tokens_seen": 48515585, "step": 2261, "time_per_iteration": 2.5839219093322754 }, { "auxiliary_loss_clip": 0.01183076, "auxiliary_loss_mlp": 0.0103808, "balance_loss_clip": 1.0546602, "balance_loss_mlp": 1.0291574, "epoch": 0.27198941862562376, "flos": 26032220000640.0, "grad_norm": 1.602194166732769, "language_loss": 0.79815626, "learning_rate": 3.4166900615407e-06, "loss": 0.82036781, "num_input_tokens_seen": 48533500, "step": 2262, "time_per_iteration": 2.5713634490966797 }, { "auxiliary_loss_clip": 0.01181644, "auxiliary_loss_mlp": 0.01035002, "balance_loss_clip": 1.05782342, "balance_loss_mlp": 1.02604365, "epoch": 0.27210966151626287, "flos": 32780983760640.0, "grad_norm": 3.7549789484977882, "language_loss": 0.74882424, "learning_rate": 3.416140104218436e-06, "loss": 0.77099073, "num_input_tokens_seen": 48552865, "step": 2263, "time_per_iteration": 2.602962017059326 }, { "auxiliary_loss_clip": 0.01068012, "auxiliary_loss_mlp": 0.00753123, "balance_loss_clip": 1.01964021, "balance_loss_mlp": 1.00008857, "epoch": 0.2722299044069019, "flos": 65471043219840.0, "grad_norm": 0.8481393731686929, "language_loss": 0.69674075, "learning_rate": 3.4155899320688437e-06, "loss": 0.71495211, "num_input_tokens_seen": 48618940, "step": 2264, "time_per_iteration": 3.2006149291992188 }, { "auxiliary_loss_clip": 0.01131867, "auxiliary_loss_mlp": 0.01036602, "balance_loss_clip": 1.05524397, "balance_loss_mlp": 1.02785814, "epoch": 0.27235014729754103, "flos": 15334143782400.0, "grad_norm": 2.1999569119108955, "language_loss": 0.74411392, "learning_rate": 3.415039545175384e-06, "loss": 0.76579857, "num_input_tokens_seen": 48634665, "step": 2265, "time_per_iteration": 2.59149169921875 }, { "auxiliary_loss_clip": 0.01183574, "auxiliary_loss_mlp": 0.01031228, "balance_loss_clip": 1.05618286, "balance_loss_mlp": 1.02316928, "epoch": 0.27247039018818014, "flos": 21872363973120.0, "grad_norm": 1.9811182495103772, "language_loss": 0.65338802, "learning_rate": 3.414488943621551e-06, "loss": 0.67553604, "num_input_tokens_seen": 48653330, "step": 2266, "time_per_iteration": 2.648681163787842 }, { "auxiliary_loss_clip": 0.01180206, "auxiliary_loss_mlp": 0.01029991, "balance_loss_clip": 1.05708086, "balance_loss_mlp": 1.02144992, "epoch": 0.2725906330788192, "flos": 18695490514560.0, "grad_norm": 3.436472819122573, "language_loss": 0.73715234, "learning_rate": 3.41393812749087e-06, "loss": 0.75925434, "num_input_tokens_seen": 48671375, "step": 2267, "time_per_iteration": 2.480748414993286 }, { "auxiliary_loss_clip": 0.01171464, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.05973148, "balance_loss_mlp": 1.02409697, "epoch": 0.2727108759694583, "flos": 17886099398400.0, "grad_norm": 2.1550656314549985, "language_loss": 0.71902907, "learning_rate": 3.4133870968668984e-06, "loss": 0.74106818, "num_input_tokens_seen": 48686175, "step": 2268, "time_per_iteration": 2.572136878967285 }, { "auxiliary_loss_clip": 0.0117183, "auxiliary_loss_mlp": 0.01031925, "balance_loss_clip": 1.05667353, "balance_loss_mlp": 1.02279353, "epoch": 0.2728311188600974, "flos": 24461666755200.0, "grad_norm": 1.788150724339478, "language_loss": 0.786641, "learning_rate": 3.412835851833229e-06, "loss": 0.80867851, "num_input_tokens_seen": 48708370, "step": 2269, "time_per_iteration": 2.5876986980438232 }, { "auxiliary_loss_clip": 0.01179797, "auxiliary_loss_mlp": 0.01030713, "balance_loss_clip": 1.05809331, "balance_loss_mlp": 1.0227077, "epoch": 0.2729513617507365, "flos": 30993314757120.0, "grad_norm": 1.8001756783304097, "language_loss": 0.77820289, "learning_rate": 3.4122843924734834e-06, "loss": 0.80030799, "num_input_tokens_seen": 48730670, "step": 2270, "time_per_iteration": 2.6624083518981934 }, { "auxiliary_loss_clip": 0.01164215, "auxiliary_loss_mlp": 0.01033999, "balance_loss_clip": 1.05291176, "balance_loss_mlp": 1.02509463, "epoch": 0.2730716046413756, "flos": 19094637421440.0, "grad_norm": 3.3996512260842264, "language_loss": 0.87893951, "learning_rate": 3.411732718871319e-06, "loss": 0.9009217, "num_input_tokens_seen": 48746510, "step": 2271, "time_per_iteration": 2.516752004623413 }, { "auxiliary_loss_clip": 0.01200705, "auxiliary_loss_mlp": 0.01043313, "balance_loss_clip": 1.06394768, "balance_loss_mlp": 1.03538251, "epoch": 0.27319184753201464, "flos": 26944566474240.0, "grad_norm": 1.5351493955583282, "language_loss": 0.78631657, "learning_rate": 3.4111808311104227e-06, "loss": 0.80875671, "num_input_tokens_seen": 48768825, "step": 2272, "time_per_iteration": 2.6020734310150146 }, { "auxiliary_loss_clip": 0.0117568, "auxiliary_loss_mlp": 0.0103138, "balance_loss_clip": 1.05491078, "balance_loss_mlp": 1.02276075, "epoch": 0.27331209042265375, "flos": 31759828012800.0, "grad_norm": 1.6908205418561253, "language_loss": 0.69405758, "learning_rate": 3.410628729274517e-06, "loss": 0.71612823, "num_input_tokens_seen": 48790345, "step": 2273, "time_per_iteration": 2.6533362865448 }, { "auxiliary_loss_clip": 0.01169293, "auxiliary_loss_mlp": 0.00762278, "balance_loss_clip": 1.05876446, "balance_loss_mlp": 1.00030541, "epoch": 0.27343233331329286, "flos": 25739081107200.0, "grad_norm": 1.8678985382687405, "language_loss": 0.8258878, "learning_rate": 3.4100764134473546e-06, "loss": 0.84520352, "num_input_tokens_seen": 48809630, "step": 2274, "time_per_iteration": 3.484149932861328 }, { "auxiliary_loss_clip": 0.01199705, "auxiliary_loss_mlp": 0.01036378, "balance_loss_clip": 1.06190908, "balance_loss_mlp": 1.02710927, "epoch": 0.2735525762039319, "flos": 24389414547840.0, "grad_norm": 2.440916894271007, "language_loss": 0.8498624, "learning_rate": 3.4095238837127215e-06, "loss": 0.8722232, "num_input_tokens_seen": 48828770, "step": 2275, "time_per_iteration": 2.523191452026367 }, { "auxiliary_loss_clip": 0.01151603, "auxiliary_loss_mlp": 0.0103484, "balance_loss_clip": 1.05229747, "balance_loss_mlp": 1.02612615, "epoch": 0.27367281909457103, "flos": 14465357527680.0, "grad_norm": 1.9720022296458541, "language_loss": 0.79697514, "learning_rate": 3.4089711401544355e-06, "loss": 0.81883955, "num_input_tokens_seen": 48846365, "step": 2276, "time_per_iteration": 2.6317927837371826 }, { "auxiliary_loss_clip": 0.01181717, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.05582213, "balance_loss_mlp": 1.02717209, "epoch": 0.27379306198521014, "flos": 23476996247040.0, "grad_norm": 2.1710850209138504, "language_loss": 0.67650348, "learning_rate": 3.4084181828563486e-06, "loss": 0.69868308, "num_input_tokens_seen": 48863085, "step": 2277, "time_per_iteration": 2.5354247093200684 }, { "auxiliary_loss_clip": 0.01138721, "auxiliary_loss_mlp": 0.01035209, "balance_loss_clip": 1.05287623, "balance_loss_mlp": 1.02623296, "epoch": 0.2739133048758492, "flos": 17458152762240.0, "grad_norm": 1.6218890834247193, "language_loss": 0.70605528, "learning_rate": 3.4078650119023428e-06, "loss": 0.72779459, "num_input_tokens_seen": 48881400, "step": 2278, "time_per_iteration": 2.6225030422210693 }, { "auxiliary_loss_clip": 0.01128205, "auxiliary_loss_mlp": 0.01029558, "balance_loss_clip": 1.04796946, "balance_loss_mlp": 1.02124929, "epoch": 0.2740335477664883, "flos": 19273113123840.0, "grad_norm": 2.036580690976605, "language_loss": 0.74273491, "learning_rate": 3.4073116273763337e-06, "loss": 0.76431257, "num_input_tokens_seen": 48895845, "step": 2279, "time_per_iteration": 2.6097829341888428 }, { "auxiliary_loss_clip": 0.01172522, "auxiliary_loss_mlp": 0.01029652, "balance_loss_clip": 1.05498958, "balance_loss_mlp": 1.020908, "epoch": 0.2741537906571274, "flos": 26104723603200.0, "grad_norm": 3.888509359255447, "language_loss": 0.81337821, "learning_rate": 3.40675802936227e-06, "loss": 0.83539987, "num_input_tokens_seen": 48916630, "step": 2280, "time_per_iteration": 3.3913345336914062 }, { "auxiliary_loss_clip": 0.01161393, "auxiliary_loss_mlp": 0.01034777, "balance_loss_clip": 1.05448782, "balance_loss_mlp": 1.02647996, "epoch": 0.27427403354776647, "flos": 34164190644480.0, "grad_norm": 2.2894344403407896, "language_loss": 0.71678078, "learning_rate": 3.4062042179441318e-06, "loss": 0.73874247, "num_input_tokens_seen": 48937100, "step": 2281, "time_per_iteration": 3.3291873931884766 }, { "auxiliary_loss_clip": 0.01183796, "auxiliary_loss_mlp": 0.01033019, "balance_loss_clip": 1.06016493, "balance_loss_mlp": 1.0252527, "epoch": 0.2743942764384056, "flos": 18766988536320.0, "grad_norm": 2.0746420146189757, "language_loss": 0.80552638, "learning_rate": 3.4056501932059314e-06, "loss": 0.82769454, "num_input_tokens_seen": 48955175, "step": 2282, "time_per_iteration": 2.5367634296417236 }, { "auxiliary_loss_clip": 0.01095811, "auxiliary_loss_mlp": 0.01005538, "balance_loss_clip": 1.02536678, "balance_loss_mlp": 1.00328541, "epoch": 0.2745145193290447, "flos": 64904048058240.0, "grad_norm": 0.780079428552975, "language_loss": 0.58127439, "learning_rate": 3.405095955231715e-06, "loss": 0.60228789, "num_input_tokens_seen": 49006830, "step": 2283, "time_per_iteration": 3.843421220779419 }, { "auxiliary_loss_clip": 0.01185487, "auxiliary_loss_mlp": 0.01029124, "balance_loss_clip": 1.05570102, "balance_loss_mlp": 1.02040982, "epoch": 0.27463476221968375, "flos": 16136926796160.0, "grad_norm": 2.1543327281224935, "language_loss": 0.94339073, "learning_rate": 3.4045415041055585e-06, "loss": 0.96553689, "num_input_tokens_seen": 49022470, "step": 2284, "time_per_iteration": 2.4744014739990234 }, { "auxiliary_loss_clip": 0.01175467, "auxiliary_loss_mlp": 0.01031281, "balance_loss_clip": 1.05705309, "balance_loss_mlp": 1.02267432, "epoch": 0.27475500511032286, "flos": 10376712213120.0, "grad_norm": 2.2405872840752363, "language_loss": 0.78392631, "learning_rate": 3.4039868399115728e-06, "loss": 0.80599374, "num_input_tokens_seen": 49037110, "step": 2285, "time_per_iteration": 2.509169578552246 }, { "auxiliary_loss_clip": 0.01133063, "auxiliary_loss_mlp": 0.01034206, "balance_loss_clip": 1.05610323, "balance_loss_mlp": 1.02559328, "epoch": 0.27487524800096197, "flos": 17311062568320.0, "grad_norm": 1.8219634809793295, "language_loss": 0.80644256, "learning_rate": 3.4034319627339003e-06, "loss": 0.82811522, "num_input_tokens_seen": 49053975, "step": 2286, "time_per_iteration": 2.5945608615875244 }, { "auxiliary_loss_clip": 0.0117601, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.05919862, "balance_loss_mlp": 1.02302563, "epoch": 0.274995490891601, "flos": 27120205002240.0, "grad_norm": 2.447530144356101, "language_loss": 0.69527757, "learning_rate": 3.402876872656715e-06, "loss": 0.71736318, "num_input_tokens_seen": 49072295, "step": 2287, "time_per_iteration": 2.5797178745269775 }, { "auxiliary_loss_clip": 0.01170615, "auxiliary_loss_mlp": 0.01034501, "balance_loss_clip": 1.06035018, "balance_loss_mlp": 1.02588856, "epoch": 0.27511573378224013, "flos": 23436093634560.0, "grad_norm": 1.8859791511865065, "language_loss": 0.89366668, "learning_rate": 3.402321569764223e-06, "loss": 0.91571784, "num_input_tokens_seen": 49091600, "step": 2288, "time_per_iteration": 2.5705463886260986 }, { "auxiliary_loss_clip": 0.01143996, "auxiliary_loss_mlp": 0.00763037, "balance_loss_clip": 1.05294728, "balance_loss_mlp": 1.0003767, "epoch": 0.2752359766728792, "flos": 16722019434240.0, "grad_norm": 1.750682033908901, "language_loss": 0.83623439, "learning_rate": 3.4017660541406635e-06, "loss": 0.85530472, "num_input_tokens_seen": 49107665, "step": 2289, "time_per_iteration": 2.564131259918213 }, { "auxiliary_loss_clip": 0.01176769, "auxiliary_loss_mlp": 0.01033275, "balance_loss_clip": 1.05448079, "balance_loss_mlp": 1.02477527, "epoch": 0.2753562195635183, "flos": 25297738698240.0, "grad_norm": 1.6274894987992146, "language_loss": 0.74114525, "learning_rate": 3.4012103258703092e-06, "loss": 0.7632457, "num_input_tokens_seen": 49126420, "step": 2290, "time_per_iteration": 2.583829641342163 }, { "auxiliary_loss_clip": 0.01155931, "auxiliary_loss_mlp": 0.01027624, "balance_loss_clip": 1.05345964, "balance_loss_mlp": 1.01913071, "epoch": 0.2754764624541574, "flos": 27338972785920.0, "grad_norm": 1.9034205286247927, "language_loss": 0.83004677, "learning_rate": 3.4006543850374616e-06, "loss": 0.85188234, "num_input_tokens_seen": 49141470, "step": 2291, "time_per_iteration": 2.6084015369415283 }, { "auxiliary_loss_clip": 0.01185322, "auxiliary_loss_mlp": 0.01029381, "balance_loss_clip": 1.05609512, "balance_loss_mlp": 1.02111387, "epoch": 0.27559670534479647, "flos": 17238379397760.0, "grad_norm": 3.452400852496113, "language_loss": 0.74744749, "learning_rate": 3.400098231726458e-06, "loss": 0.76959455, "num_input_tokens_seen": 49158570, "step": 2292, "time_per_iteration": 2.496499538421631 }, { "auxiliary_loss_clip": 0.01160475, "auxiliary_loss_mlp": 0.01033232, "balance_loss_clip": 1.05353534, "balance_loss_mlp": 1.02452922, "epoch": 0.2757169482354356, "flos": 21939085486080.0, "grad_norm": 1.9006559471212872, "language_loss": 0.86941755, "learning_rate": 3.3995418660216657e-06, "loss": 0.89135456, "num_input_tokens_seen": 49176025, "step": 2293, "time_per_iteration": 2.569826602935791 }, { "auxiliary_loss_clip": 0.01206133, "auxiliary_loss_mlp": 0.01032502, "balance_loss_clip": 1.06126571, "balance_loss_mlp": 1.02295303, "epoch": 0.2758371911260747, "flos": 20850669521280.0, "grad_norm": 2.4926548021187394, "language_loss": 0.80531693, "learning_rate": 3.3989852880074848e-06, "loss": 0.82770324, "num_input_tokens_seen": 49197455, "step": 2294, "time_per_iteration": 2.5286197662353516 }, { "auxiliary_loss_clip": 0.01072357, "auxiliary_loss_mlp": 0.01002806, "balance_loss_clip": 1.02571154, "balance_loss_mlp": 1.00073195, "epoch": 0.27595743401671374, "flos": 69269063592960.0, "grad_norm": 0.7477702102275624, "language_loss": 0.60645652, "learning_rate": 3.398428497768348e-06, "loss": 0.62720811, "num_input_tokens_seen": 49262625, "step": 2295, "time_per_iteration": 3.259068250656128 }, { "auxiliary_loss_clip": 0.01164687, "auxiliary_loss_mlp": 0.01033976, "balance_loss_clip": 1.05507493, "balance_loss_mlp": 1.02591777, "epoch": 0.27607767690735285, "flos": 21215019127680.0, "grad_norm": 1.6307159120361676, "language_loss": 0.71894181, "learning_rate": 3.3978714953887205e-06, "loss": 0.74092841, "num_input_tokens_seen": 49282380, "step": 2296, "time_per_iteration": 2.5908689498901367 }, { "auxiliary_loss_clip": 0.01127996, "auxiliary_loss_mlp": 0.01032992, "balance_loss_clip": 1.04803848, "balance_loss_mlp": 1.02377665, "epoch": 0.27619791979799196, "flos": 24825334003200.0, "grad_norm": 1.6951924245105532, "language_loss": 0.85909843, "learning_rate": 3.397314280953098e-06, "loss": 0.88070828, "num_input_tokens_seen": 49303205, "step": 2297, "time_per_iteration": 2.646383762359619 }, { "auxiliary_loss_clip": 0.01165186, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.05380154, "balance_loss_mlp": 1.02311552, "epoch": 0.276318162688631, "flos": 24753548672640.0, "grad_norm": 1.9609257505174187, "language_loss": 0.79776949, "learning_rate": 3.3967568545460108e-06, "loss": 0.81973922, "num_input_tokens_seen": 49322745, "step": 2298, "time_per_iteration": 2.5776305198669434 }, { "auxiliary_loss_clip": 0.01187015, "auxiliary_loss_mlp": 0.01031575, "balance_loss_clip": 1.061674, "balance_loss_mlp": 1.02259278, "epoch": 0.27643840557927013, "flos": 18150007599360.0, "grad_norm": 1.94470900507694, "language_loss": 0.80802852, "learning_rate": 3.3961992162520185e-06, "loss": 0.83021438, "num_input_tokens_seen": 49341370, "step": 2299, "time_per_iteration": 2.496711015701294 }, { "auxiliary_loss_clip": 0.0118406, "auxiliary_loss_mlp": 0.01030525, "balance_loss_clip": 1.0559057, "balance_loss_mlp": 1.02144158, "epoch": 0.27655864846990924, "flos": 24823933372800.0, "grad_norm": 2.0237109343743804, "language_loss": 0.71622896, "learning_rate": 3.3956413661557156e-06, "loss": 0.73837483, "num_input_tokens_seen": 49361545, "step": 2300, "time_per_iteration": 3.2322211265563965 }, { "auxiliary_loss_clip": 0.01161384, "auxiliary_loss_mlp": 0.01033318, "balance_loss_clip": 1.05302405, "balance_loss_mlp": 1.0242939, "epoch": 0.2766788913605483, "flos": 20266582464000.0, "grad_norm": 2.1263552932332885, "language_loss": 0.65990627, "learning_rate": 3.3950833043417273e-06, "loss": 0.68185329, "num_input_tokens_seen": 49379690, "step": 2301, "time_per_iteration": 2.573422431945801 }, { "auxiliary_loss_clip": 0.01189766, "auxiliary_loss_mlp": 0.01035358, "balance_loss_clip": 1.0613606, "balance_loss_mlp": 1.02589869, "epoch": 0.2767991342511874, "flos": 21470272151040.0, "grad_norm": 2.4389957663547936, "language_loss": 0.73478973, "learning_rate": 3.3945250308947105e-06, "loss": 0.75704098, "num_input_tokens_seen": 49395995, "step": 2302, "time_per_iteration": 2.495997905731201 }, { "auxiliary_loss_clip": 0.01084349, "auxiliary_loss_mlp": 0.01002615, "balance_loss_clip": 1.02532744, "balance_loss_mlp": 1.00048161, "epoch": 0.2769193771418265, "flos": 66002627571840.0, "grad_norm": 1.284581056719423, "language_loss": 0.68331122, "learning_rate": 3.3939665458993556e-06, "loss": 0.70418084, "num_input_tokens_seen": 49450415, "step": 2303, "time_per_iteration": 3.056732177734375 }, { "auxiliary_loss_clip": 0.01160896, "auxiliary_loss_mlp": 0.01027349, "balance_loss_clip": 1.05259442, "balance_loss_mlp": 1.01846802, "epoch": 0.27703962003246557, "flos": 20704441253760.0, "grad_norm": 2.3994453602002253, "language_loss": 0.76605582, "learning_rate": 3.3934078494403843e-06, "loss": 0.7879383, "num_input_tokens_seen": 49469990, "step": 2304, "time_per_iteration": 2.6541948318481445 }, { "auxiliary_loss_clip": 0.01108309, "auxiliary_loss_mlp": 0.00763169, "balance_loss_clip": 1.04957771, "balance_loss_mlp": 1.00035763, "epoch": 0.2771598629231047, "flos": 22929897219840.0, "grad_norm": 1.7117782930569578, "language_loss": 0.81321514, "learning_rate": 3.3928489416025495e-06, "loss": 0.83192986, "num_input_tokens_seen": 49490835, "step": 2305, "time_per_iteration": 2.699719190597534 }, { "auxiliary_loss_clip": 0.01169345, "auxiliary_loss_mlp": 0.0103371, "balance_loss_clip": 1.0562526, "balance_loss_mlp": 1.02475715, "epoch": 0.27728010581374374, "flos": 18369457741440.0, "grad_norm": 2.3706953056481535, "language_loss": 0.79159749, "learning_rate": 3.392289822470638e-06, "loss": 0.81362808, "num_input_tokens_seen": 49508815, "step": 2306, "time_per_iteration": 4.017672300338745 }, { "auxiliary_loss_clip": 0.01170894, "auxiliary_loss_mlp": 0.01032487, "balance_loss_clip": 1.05587888, "balance_loss_mlp": 1.0233078, "epoch": 0.27740034870438285, "flos": 19427637432960.0, "grad_norm": 3.0339486039165546, "language_loss": 0.76025456, "learning_rate": 3.3917304921294674e-06, "loss": 0.78228843, "num_input_tokens_seen": 49526980, "step": 2307, "time_per_iteration": 2.5429069995880127 }, { "auxiliary_loss_clip": 0.01186361, "auxiliary_loss_mlp": 0.01031375, "balance_loss_clip": 1.05800498, "balance_loss_mlp": 1.02219605, "epoch": 0.27752059159502196, "flos": 21614776565760.0, "grad_norm": 9.416918470282337, "language_loss": 0.80699545, "learning_rate": 3.3911709506638876e-06, "loss": 0.82917285, "num_input_tokens_seen": 49546290, "step": 2308, "time_per_iteration": 2.5394418239593506 }, { "auxiliary_loss_clip": 0.01145806, "auxiliary_loss_mlp": 0.00762606, "balance_loss_clip": 1.04922771, "balance_loss_mlp": 1.00033951, "epoch": 0.277640834485661, "flos": 26608011016320.0, "grad_norm": 2.4037091303076115, "language_loss": 0.81399131, "learning_rate": 3.390611198158781e-06, "loss": 0.8330754, "num_input_tokens_seen": 49564165, "step": 2309, "time_per_iteration": 3.4194347858428955 }, { "auxiliary_loss_clip": 0.01201752, "auxiliary_loss_mlp": 0.01038655, "balance_loss_clip": 1.06038666, "balance_loss_mlp": 1.02965486, "epoch": 0.2777610773763001, "flos": 19492814661120.0, "grad_norm": 2.060080663708529, "language_loss": 0.89733112, "learning_rate": 3.3900512346990612e-06, "loss": 0.91973519, "num_input_tokens_seen": 49580155, "step": 2310, "time_per_iteration": 2.483231782913208 }, { "auxiliary_loss_clip": 0.01141226, "auxiliary_loss_mlp": 0.01035675, "balance_loss_clip": 1.04985797, "balance_loss_mlp": 1.02600145, "epoch": 0.27788132026693924, "flos": 38290650001920.0, "grad_norm": 1.5529337323387418, "language_loss": 0.6577422, "learning_rate": 3.389491060369674e-06, "loss": 0.67951119, "num_input_tokens_seen": 49605830, "step": 2311, "time_per_iteration": 2.753255605697632 }, { "auxiliary_loss_clip": 0.01136092, "auxiliary_loss_mlp": 0.01029333, "balance_loss_clip": 1.05210257, "balance_loss_mlp": 1.02030838, "epoch": 0.2780015631575783, "flos": 22382546797440.0, "grad_norm": 1.8922797477492919, "language_loss": 0.89742136, "learning_rate": 3.388930675255598e-06, "loss": 0.91907561, "num_input_tokens_seen": 49625680, "step": 2312, "time_per_iteration": 2.6003127098083496 }, { "auxiliary_loss_clip": 0.01176891, "auxiliary_loss_mlp": 0.01037093, "balance_loss_clip": 1.05801845, "balance_loss_mlp": 1.02878451, "epoch": 0.2781218060482174, "flos": 12203200840320.0, "grad_norm": 2.9341162273159656, "language_loss": 0.79254663, "learning_rate": 3.388370079441843e-06, "loss": 0.81468654, "num_input_tokens_seen": 49641195, "step": 2313, "time_per_iteration": 2.5437889099121094 }, { "auxiliary_loss_clip": 0.01165516, "auxiliary_loss_mlp": 0.01025489, "balance_loss_clip": 1.06128693, "balance_loss_mlp": 1.01664984, "epoch": 0.2782420489388565, "flos": 18107632529280.0, "grad_norm": 2.7830308280423215, "language_loss": 0.92708522, "learning_rate": 3.3878092730134505e-06, "loss": 0.94899529, "num_input_tokens_seen": 49659180, "step": 2314, "time_per_iteration": 2.5944576263427734 }, { "auxiliary_loss_clip": 0.01178343, "auxiliary_loss_mlp": 0.01033178, "balance_loss_clip": 1.05664587, "balance_loss_mlp": 1.02424335, "epoch": 0.27836229182949557, "flos": 18514752255360.0, "grad_norm": 1.6877238273571884, "language_loss": 0.80810231, "learning_rate": 3.3872482560554947e-06, "loss": 0.83021748, "num_input_tokens_seen": 49677955, "step": 2315, "time_per_iteration": 2.5167667865753174 }, { "auxiliary_loss_clip": 0.01078365, "auxiliary_loss_mlp": 0.01001793, "balance_loss_clip": 1.01988482, "balance_loss_mlp": 0.99956381, "epoch": 0.2784825347201347, "flos": 67079230940160.0, "grad_norm": 0.8015104850419964, "language_loss": 0.57017863, "learning_rate": 3.386687028653082e-06, "loss": 0.59098023, "num_input_tokens_seen": 49740800, "step": 2316, "time_per_iteration": 3.133065700531006 }, { "auxiliary_loss_clip": 0.01144245, "auxiliary_loss_mlp": 0.01035397, "balance_loss_clip": 1.05539536, "balance_loss_mlp": 1.02654564, "epoch": 0.2786027776107738, "flos": 22631119891200.0, "grad_norm": 1.806315407295578, "language_loss": 0.85074472, "learning_rate": 3.386125590891349e-06, "loss": 0.87254119, "num_input_tokens_seen": 49757675, "step": 2317, "time_per_iteration": 2.62072491645813 }, { "auxiliary_loss_clip": 0.0115682, "auxiliary_loss_mlp": 0.01029952, "balance_loss_clip": 1.05097592, "balance_loss_mlp": 1.02168489, "epoch": 0.27872302050141284, "flos": 15778826156160.0, "grad_norm": 1.9837047466976963, "language_loss": 0.82905364, "learning_rate": 3.3855639428554657e-06, "loss": 0.85092139, "num_input_tokens_seen": 49775205, "step": 2318, "time_per_iteration": 2.546367883682251 }, { "auxiliary_loss_clip": 0.01146372, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.0535562, "balance_loss_mlp": 1.02405477, "epoch": 0.27884326339205195, "flos": 22126970551680.0, "grad_norm": 1.753956472785239, "language_loss": 0.80786771, "learning_rate": 3.385002084630635e-06, "loss": 0.82965434, "num_input_tokens_seen": 49794175, "step": 2319, "time_per_iteration": 2.57684063911438 }, { "auxiliary_loss_clip": 0.01188095, "auxiliary_loss_mlp": 0.01032736, "balance_loss_clip": 1.05644846, "balance_loss_mlp": 1.02374172, "epoch": 0.278963506282691, "flos": 20558715776640.0, "grad_norm": 1.8549229915251515, "language_loss": 0.85078871, "learning_rate": 3.384440016302088e-06, "loss": 0.87299699, "num_input_tokens_seen": 49812850, "step": 2320, "time_per_iteration": 2.500657796859741 }, { "auxiliary_loss_clip": 0.01179033, "auxiliary_loss_mlp": 0.01025053, "balance_loss_clip": 1.05489326, "balance_loss_mlp": 1.01639235, "epoch": 0.2790837491733301, "flos": 21942928241280.0, "grad_norm": 2.045960203096301, "language_loss": 0.61753112, "learning_rate": 3.3838777379550923e-06, "loss": 0.63957202, "num_input_tokens_seen": 49832295, "step": 2321, "time_per_iteration": 2.5185043811798096 }, { "auxiliary_loss_clip": 0.01172967, "auxiliary_loss_mlp": 0.01038195, "balance_loss_clip": 1.05738866, "balance_loss_mlp": 1.02950466, "epoch": 0.27920399206396923, "flos": 26286790665600.0, "grad_norm": 5.06074148636033, "language_loss": 0.7857995, "learning_rate": 3.383315249674944e-06, "loss": 0.80791122, "num_input_tokens_seen": 49850860, "step": 2322, "time_per_iteration": 2.5764336585998535 }, { "auxiliary_loss_clip": 0.01161928, "auxiliary_loss_mlp": 0.01036245, "balance_loss_clip": 1.05752814, "balance_loss_mlp": 1.02765036, "epoch": 0.2793242349546083, "flos": 25400981364480.0, "grad_norm": 1.8712862256211764, "language_loss": 0.85873115, "learning_rate": 3.3827525515469715e-06, "loss": 0.88071281, "num_input_tokens_seen": 49865765, "step": 2323, "time_per_iteration": 2.5874247550964355 }, { "auxiliary_loss_clip": 0.01151026, "auxiliary_loss_mlp": 0.0103359, "balance_loss_clip": 1.05195189, "balance_loss_mlp": 1.02407146, "epoch": 0.2794444778452474, "flos": 20850346298880.0, "grad_norm": 2.1852777591156536, "language_loss": 0.70642447, "learning_rate": 3.3821896436565367e-06, "loss": 0.72827065, "num_input_tokens_seen": 49885425, "step": 2324, "time_per_iteration": 2.5841286182403564 }, { "auxiliary_loss_clip": 0.01190584, "auxiliary_loss_mlp": 0.01035146, "balance_loss_clip": 1.06318879, "balance_loss_mlp": 1.02670014, "epoch": 0.2795647207358865, "flos": 21576244250880.0, "grad_norm": 1.6838341951319196, "language_loss": 0.70392448, "learning_rate": 3.381626526089032e-06, "loss": 0.72618175, "num_input_tokens_seen": 49904990, "step": 2325, "time_per_iteration": 3.3648955821990967 }, { "auxiliary_loss_clip": 0.01167087, "auxiliary_loss_mlp": 0.01032214, "balance_loss_clip": 1.05392408, "balance_loss_mlp": 1.02344632, "epoch": 0.27968496362652556, "flos": 21471744608640.0, "grad_norm": 1.9927110247272322, "language_loss": 0.79081529, "learning_rate": 3.3810631989298815e-06, "loss": 0.81280828, "num_input_tokens_seen": 49924600, "step": 2326, "time_per_iteration": 2.543272018432617 }, { "auxiliary_loss_clip": 0.01150093, "auxiliary_loss_mlp": 0.01029562, "balance_loss_clip": 1.05717731, "balance_loss_mlp": 1.02026916, "epoch": 0.2798052065171647, "flos": 23258695340160.0, "grad_norm": 1.974147088696905, "language_loss": 0.83774936, "learning_rate": 3.3804996622645423e-06, "loss": 0.85954589, "num_input_tokens_seen": 49942600, "step": 2327, "time_per_iteration": 2.626181125640869 }, { "auxiliary_loss_clip": 0.01200029, "auxiliary_loss_mlp": 0.01032782, "balance_loss_clip": 1.05928755, "balance_loss_mlp": 1.02386498, "epoch": 0.2799254494078038, "flos": 21539328048000.0, "grad_norm": 1.8668572346391497, "language_loss": 0.89678979, "learning_rate": 3.3799359161785015e-06, "loss": 0.91911787, "num_input_tokens_seen": 49962250, "step": 2328, "time_per_iteration": 2.508880615234375 }, { "auxiliary_loss_clip": 0.01186627, "auxiliary_loss_mlp": 0.01026439, "balance_loss_clip": 1.05880737, "balance_loss_mlp": 1.01773667, "epoch": 0.28004569229844284, "flos": 26393912000640.0, "grad_norm": 1.5293267680037377, "language_loss": 0.85518348, "learning_rate": 3.3793719607572798e-06, "loss": 0.87731409, "num_input_tokens_seen": 49983215, "step": 2329, "time_per_iteration": 2.582002639770508 }, { "auxiliary_loss_clip": 0.01154672, "auxiliary_loss_mlp": 0.01033533, "balance_loss_clip": 1.05176592, "balance_loss_mlp": 1.02453828, "epoch": 0.28016593518908195, "flos": 33547676584320.0, "grad_norm": 1.883495065171994, "language_loss": 0.76823884, "learning_rate": 3.378807796086428e-06, "loss": 0.7901209, "num_input_tokens_seen": 50006075, "step": 2330, "time_per_iteration": 2.649073839187622 }, { "auxiliary_loss_clip": 0.01203426, "auxiliary_loss_mlp": 0.01036393, "balance_loss_clip": 1.06290126, "balance_loss_mlp": 1.02768493, "epoch": 0.28028617807972106, "flos": 15340823712000.0, "grad_norm": 3.5502929597227815, "language_loss": 0.76534289, "learning_rate": 3.37824342225153e-06, "loss": 0.78774107, "num_input_tokens_seen": 50022495, "step": 2331, "time_per_iteration": 2.4641940593719482 }, { "auxiliary_loss_clip": 0.01146351, "auxiliary_loss_mlp": 0.01027906, "balance_loss_clip": 1.05596852, "balance_loss_mlp": 1.01950204, "epoch": 0.2804064209703601, "flos": 25520277409920.0, "grad_norm": 1.8384849859055858, "language_loss": 0.77740169, "learning_rate": 3.3776788393382006e-06, "loss": 0.79914427, "num_input_tokens_seen": 50041975, "step": 2332, "time_per_iteration": 4.097648620605469 }, { "auxiliary_loss_clip": 0.01204909, "auxiliary_loss_mlp": 0.01035276, "balance_loss_clip": 1.06305301, "balance_loss_mlp": 1.02567995, "epoch": 0.2805266638609992, "flos": 29351766280320.0, "grad_norm": 2.3644137433015375, "language_loss": 0.76232588, "learning_rate": 3.3771140474320872e-06, "loss": 0.78472775, "num_input_tokens_seen": 50061925, "step": 2333, "time_per_iteration": 2.5466911792755127 }, { "auxiliary_loss_clip": 0.01166638, "auxiliary_loss_mlp": 0.010351, "balance_loss_clip": 1.05780804, "balance_loss_mlp": 1.02610588, "epoch": 0.28064690675163834, "flos": 21463735875840.0, "grad_norm": 2.075772375120525, "language_loss": 0.79563433, "learning_rate": 3.3765490466188664e-06, "loss": 0.81765175, "num_input_tokens_seen": 50079325, "step": 2334, "time_per_iteration": 2.5882232189178467 }, { "auxiliary_loss_clip": 0.01159415, "auxiliary_loss_mlp": 0.01038455, "balance_loss_clip": 1.0573988, "balance_loss_mlp": 1.02964568, "epoch": 0.2807671496422774, "flos": 20995640812800.0, "grad_norm": 2.465579958995106, "language_loss": 0.73542249, "learning_rate": 3.3759838369842508e-06, "loss": 0.75740123, "num_input_tokens_seen": 50097400, "step": 2335, "time_per_iteration": 3.3503365516662598 }, { "auxiliary_loss_clip": 0.01156852, "auxiliary_loss_mlp": 0.01030001, "balance_loss_clip": 1.05525935, "balance_loss_mlp": 1.02116799, "epoch": 0.2808873925329165, "flos": 21506577822720.0, "grad_norm": 2.0364294958930533, "language_loss": 0.72825623, "learning_rate": 3.375418418613981e-06, "loss": 0.75012481, "num_input_tokens_seen": 50116425, "step": 2336, "time_per_iteration": 2.5756640434265137 }, { "auxiliary_loss_clip": 0.01178136, "auxiliary_loss_mlp": 0.01036338, "balance_loss_clip": 1.05980349, "balance_loss_mlp": 1.02750492, "epoch": 0.28100763542355556, "flos": 16070815814400.0, "grad_norm": 2.236579214357391, "language_loss": 0.82947671, "learning_rate": 3.374852791593831e-06, "loss": 0.85162145, "num_input_tokens_seen": 50132625, "step": 2337, "time_per_iteration": 2.5386619567871094 }, { "auxiliary_loss_clip": 0.01155276, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.05408335, "balance_loss_mlp": 1.02631593, "epoch": 0.28112787831419467, "flos": 19062605468160.0, "grad_norm": 2.2226510703510614, "language_loss": 0.5382067, "learning_rate": 3.374286956009605e-06, "loss": 0.5601145, "num_input_tokens_seen": 50151190, "step": 2338, "time_per_iteration": 2.5844810009002686 }, { "auxiliary_loss_clip": 0.01192518, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.06561363, "balance_loss_mlp": 1.02943969, "epoch": 0.2812481212048338, "flos": 12823629482880.0, "grad_norm": 2.108964411793755, "language_loss": 0.75374472, "learning_rate": 3.3737209119471405e-06, "loss": 0.77605766, "num_input_tokens_seen": 50167700, "step": 2339, "time_per_iteration": 2.497767210006714 }, { "auxiliary_loss_clip": 0.01195843, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.06292367, "balance_loss_mlp": 1.02721965, "epoch": 0.28136836409547283, "flos": 15633064765440.0, "grad_norm": 2.3740089726549316, "language_loss": 0.63649601, "learning_rate": 3.373154659492306e-06, "loss": 0.65882552, "num_input_tokens_seen": 50185840, "step": 2340, "time_per_iteration": 2.506227493286133 }, { "auxiliary_loss_clip": 0.01179232, "auxiliary_loss_mlp": 0.01030277, "balance_loss_clip": 1.06135356, "balance_loss_mlp": 1.02130616, "epoch": 0.28148860698611194, "flos": 19933726106880.0, "grad_norm": 2.6452116813264626, "language_loss": 0.85453004, "learning_rate": 3.3725881987310016e-06, "loss": 0.87662512, "num_input_tokens_seen": 50203375, "step": 2341, "time_per_iteration": 2.553468942642212 }, { "auxiliary_loss_clip": 0.01173539, "auxiliary_loss_mlp": 0.01033578, "balance_loss_clip": 1.05762947, "balance_loss_mlp": 1.02451849, "epoch": 0.28160884987675106, "flos": 17457219008640.0, "grad_norm": 1.7948437171279177, "language_loss": 0.87716126, "learning_rate": 3.372021529749159e-06, "loss": 0.89923245, "num_input_tokens_seen": 50222435, "step": 2342, "time_per_iteration": 2.563180685043335 }, { "auxiliary_loss_clip": 0.01139057, "auxiliary_loss_mlp": 0.01027116, "balance_loss_clip": 1.05819237, "balance_loss_mlp": 1.01814556, "epoch": 0.2817290927673901, "flos": 16834743290880.0, "grad_norm": 1.9741260920870247, "language_loss": 0.92215884, "learning_rate": 3.3714546526327405e-06, "loss": 0.94382048, "num_input_tokens_seen": 50240435, "step": 2343, "time_per_iteration": 2.679696798324585 }, { "auxiliary_loss_clip": 0.01166002, "auxiliary_loss_mlp": 0.01038705, "balance_loss_clip": 1.05743122, "balance_loss_mlp": 1.02863216, "epoch": 0.2818493356580292, "flos": 15414081500160.0, "grad_norm": 2.4276310243663763, "language_loss": 0.87668025, "learning_rate": 3.3708875674677423e-06, "loss": 0.8987273, "num_input_tokens_seen": 50258410, "step": 2344, "time_per_iteration": 2.62688946723938 }, { "auxiliary_loss_clip": 0.01184987, "auxiliary_loss_mlp": 0.01031287, "balance_loss_clip": 1.06151104, "balance_loss_mlp": 1.02087379, "epoch": 0.28196957854866833, "flos": 20412451595520.0, "grad_norm": 2.0105405331850967, "language_loss": 0.8328501, "learning_rate": 3.37032027434019e-06, "loss": 0.85501289, "num_input_tokens_seen": 50277930, "step": 2345, "time_per_iteration": 2.6030330657958984 }, { "auxiliary_loss_clip": 0.01199203, "auxiliary_loss_mlp": 0.01049297, "balance_loss_clip": 1.06237841, "balance_loss_mlp": 1.03896761, "epoch": 0.2820898214393074, "flos": 19973120348160.0, "grad_norm": 1.6571212460476064, "language_loss": 0.82804948, "learning_rate": 3.369752773336141e-06, "loss": 0.8505345, "num_input_tokens_seen": 50297410, "step": 2346, "time_per_iteration": 2.5261080265045166 }, { "auxiliary_loss_clip": 0.01176417, "auxiliary_loss_mlp": 0.0103417, "balance_loss_clip": 1.05879474, "balance_loss_mlp": 1.02444863, "epoch": 0.2822100643299465, "flos": 22528308188160.0, "grad_norm": 1.5752937694689921, "language_loss": 0.77824962, "learning_rate": 3.3691850645416864e-06, "loss": 0.80035549, "num_input_tokens_seen": 50317120, "step": 2347, "time_per_iteration": 2.5922775268554688 }, { "auxiliary_loss_clip": 0.01193897, "auxiliary_loss_mlp": 0.01038396, "balance_loss_clip": 1.06050467, "balance_loss_mlp": 1.02915764, "epoch": 0.2823303072205856, "flos": 11546682007680.0, "grad_norm": 4.3372757036243215, "language_loss": 0.83092844, "learning_rate": 3.368617148042945e-06, "loss": 0.8532514, "num_input_tokens_seen": 50334790, "step": 2348, "time_per_iteration": 2.4736721515655518 }, { "auxiliary_loss_clip": 0.01168754, "auxiliary_loss_mlp": 0.01037254, "balance_loss_clip": 1.05486274, "balance_loss_mlp": 1.02809334, "epoch": 0.28245055011122466, "flos": 18259894281600.0, "grad_norm": 2.236359607953175, "language_loss": 0.8492946, "learning_rate": 3.368049023926071e-06, "loss": 0.87135464, "num_input_tokens_seen": 50353785, "step": 2349, "time_per_iteration": 2.5265262126922607 }, { "auxiliary_loss_clip": 0.01187788, "auxiliary_loss_mlp": 0.01034188, "balance_loss_clip": 1.0608263, "balance_loss_mlp": 1.0255456, "epoch": 0.2825707930018638, "flos": 24608110504320.0, "grad_norm": 1.601790612410944, "language_loss": 0.84018081, "learning_rate": 3.3674806922772476e-06, "loss": 0.86240053, "num_input_tokens_seen": 50374670, "step": 2350, "time_per_iteration": 2.560978889465332 }, { "auxiliary_loss_clip": 0.0116557, "auxiliary_loss_mlp": 0.0103794, "balance_loss_clip": 1.05657578, "balance_loss_mlp": 1.02911222, "epoch": 0.28269103589250283, "flos": 25226994862080.0, "grad_norm": 2.1271662017492545, "language_loss": 0.75183511, "learning_rate": 3.3669121531826904e-06, "loss": 0.77387017, "num_input_tokens_seen": 50395650, "step": 2351, "time_per_iteration": 3.417557954788208 }, { "auxiliary_loss_clip": 0.01154916, "auxiliary_loss_mlp": 0.01036231, "balance_loss_clip": 1.05951262, "balance_loss_mlp": 1.02664709, "epoch": 0.28281127878314194, "flos": 19281552819840.0, "grad_norm": 2.2179443730072914, "language_loss": 0.83052421, "learning_rate": 3.366343406728647e-06, "loss": 0.85243565, "num_input_tokens_seen": 50415100, "step": 2352, "time_per_iteration": 2.5498199462890625 }, { "auxiliary_loss_clip": 0.0117959, "auxiliary_loss_mlp": 0.01034225, "balance_loss_clip": 1.05512118, "balance_loss_mlp": 1.02495015, "epoch": 0.28293152167378105, "flos": 23878405710720.0, "grad_norm": 1.8212585966856105, "language_loss": 0.68572551, "learning_rate": 3.3657744530013946e-06, "loss": 0.70786363, "num_input_tokens_seen": 50434335, "step": 2353, "time_per_iteration": 2.5268144607543945 }, { "auxiliary_loss_clip": 0.01195889, "auxiliary_loss_mlp": 0.01041043, "balance_loss_clip": 1.06276, "balance_loss_mlp": 1.03218603, "epoch": 0.2830517645644201, "flos": 43866965928960.0, "grad_norm": 1.854523478834294, "language_loss": 0.71479404, "learning_rate": 3.3652052920872437e-06, "loss": 0.73716342, "num_input_tokens_seen": 50457200, "step": 2354, "time_per_iteration": 2.704134225845337 }, { "auxiliary_loss_clip": 0.011781, "auxiliary_loss_mlp": 0.01028989, "balance_loss_clip": 1.05972373, "balance_loss_mlp": 1.01967824, "epoch": 0.2831720074550592, "flos": 26651750803200.0, "grad_norm": 2.1982389545300576, "language_loss": 0.85546333, "learning_rate": 3.3646359240725355e-06, "loss": 0.87753421, "num_input_tokens_seen": 50476390, "step": 2355, "time_per_iteration": 2.5805399417877197 }, { "auxiliary_loss_clip": 0.01184779, "auxiliary_loss_mlp": 0.00763648, "balance_loss_clip": 1.05962467, "balance_loss_mlp": 1.00037503, "epoch": 0.2832922503456983, "flos": 31029979564800.0, "grad_norm": 1.8878251582563985, "language_loss": 0.67490673, "learning_rate": 3.364066349043643e-06, "loss": 0.69439101, "num_input_tokens_seen": 50497595, "step": 2356, "time_per_iteration": 2.59031343460083 }, { "auxiliary_loss_clip": 0.01169022, "auxiliary_loss_mlp": 0.01027325, "balance_loss_clip": 1.05657482, "balance_loss_mlp": 1.01900434, "epoch": 0.2834124932363374, "flos": 20405699838720.0, "grad_norm": 1.7408698155109856, "language_loss": 0.8215242, "learning_rate": 3.363496567086969e-06, "loss": 0.84348768, "num_input_tokens_seen": 50514690, "step": 2357, "time_per_iteration": 2.5392723083496094 }, { "auxiliary_loss_clip": 0.01202552, "auxiliary_loss_mlp": 0.01040598, "balance_loss_clip": 1.06109214, "balance_loss_mlp": 1.03141856, "epoch": 0.2835327361269765, "flos": 39384848056320.0, "grad_norm": 1.8409664747447745, "language_loss": 0.7579217, "learning_rate": 3.3629265782889506e-06, "loss": 0.78035325, "num_input_tokens_seen": 50536515, "step": 2358, "time_per_iteration": 3.4181575775146484 }, { "auxiliary_loss_clip": 0.01154553, "auxiliary_loss_mlp": 0.01039057, "balance_loss_clip": 1.05246568, "balance_loss_mlp": 1.02992535, "epoch": 0.2836529790176156, "flos": 30261598801920.0, "grad_norm": 1.758168405456009, "language_loss": 0.71616125, "learning_rate": 3.362356382736054e-06, "loss": 0.73809731, "num_input_tokens_seen": 50557120, "step": 2359, "time_per_iteration": 2.641368865966797 }, { "auxiliary_loss_clip": 0.01157354, "auxiliary_loss_mlp": 0.01030335, "balance_loss_clip": 1.05092144, "balance_loss_mlp": 1.02192473, "epoch": 0.28377322190825466, "flos": 12677796264960.0, "grad_norm": 1.9150518708386102, "language_loss": 0.90632254, "learning_rate": 3.361785980514777e-06, "loss": 0.92819941, "num_input_tokens_seen": 50573320, "step": 2360, "time_per_iteration": 3.309847593307495 }, { "auxiliary_loss_clip": 0.01126675, "auxiliary_loss_mlp": 0.01032611, "balance_loss_clip": 1.05270445, "balance_loss_mlp": 1.02299106, "epoch": 0.28389346479889377, "flos": 18296666830080.0, "grad_norm": 2.2472897167114882, "language_loss": 0.76732624, "learning_rate": 3.361215371711649e-06, "loss": 0.78891909, "num_input_tokens_seen": 50592415, "step": 2361, "time_per_iteration": 2.6080105304718018 }, { "auxiliary_loss_clip": 0.01150617, "auxiliary_loss_mlp": 0.01036189, "balance_loss_clip": 1.05474639, "balance_loss_mlp": 1.02740288, "epoch": 0.2840137076895329, "flos": 20406992728320.0, "grad_norm": 1.727049474427333, "language_loss": 0.83204579, "learning_rate": 3.3606445564132326e-06, "loss": 0.85391378, "num_input_tokens_seen": 50609710, "step": 2362, "time_per_iteration": 2.562739372253418 }, { "auxiliary_loss_clip": 0.0120641, "auxiliary_loss_mlp": 0.00762966, "balance_loss_clip": 1.06317496, "balance_loss_mlp": 1.00036168, "epoch": 0.28413395058017193, "flos": 20048030161920.0, "grad_norm": 2.3058760286327233, "language_loss": 0.82320356, "learning_rate": 3.360073534706118e-06, "loss": 0.8428973, "num_input_tokens_seen": 50626865, "step": 2363, "time_per_iteration": 2.4814774990081787 }, { "auxiliary_loss_clip": 0.01175418, "auxiliary_loss_mlp": 0.010362, "balance_loss_clip": 1.05887008, "balance_loss_mlp": 1.02600169, "epoch": 0.28425419347081105, "flos": 37663613256960.0, "grad_norm": 1.9772328012963118, "language_loss": 0.75779009, "learning_rate": 3.35950230667693e-06, "loss": 0.77990627, "num_input_tokens_seen": 50648560, "step": 2364, "time_per_iteration": 2.6726913452148438 }, { "auxiliary_loss_clip": 0.01188882, "auxiliary_loss_mlp": 0.01036656, "balance_loss_clip": 1.05864322, "balance_loss_mlp": 1.02774477, "epoch": 0.28437443636145016, "flos": 13845072539520.0, "grad_norm": 2.2543759259073575, "language_loss": 0.86099052, "learning_rate": 3.358930872412323e-06, "loss": 0.88324583, "num_input_tokens_seen": 50665725, "step": 2365, "time_per_iteration": 2.47017502784729 }, { "auxiliary_loss_clip": 0.01185447, "auxiliary_loss_mlp": 0.0103826, "balance_loss_clip": 1.05976987, "balance_loss_mlp": 1.02911663, "epoch": 0.2844946792520892, "flos": 22747794243840.0, "grad_norm": 1.5921565102871085, "language_loss": 0.80955541, "learning_rate": 3.3583592319989825e-06, "loss": 0.83179247, "num_input_tokens_seen": 50685095, "step": 2366, "time_per_iteration": 2.5105650424957275 }, { "auxiliary_loss_clip": 0.01194523, "auxiliary_loss_mlp": 0.01034731, "balance_loss_clip": 1.06066811, "balance_loss_mlp": 1.02436018, "epoch": 0.2846149221427283, "flos": 32415987709440.0, "grad_norm": 2.1606507421283534, "language_loss": 0.68655193, "learning_rate": 3.357787385523627e-06, "loss": 0.70884442, "num_input_tokens_seen": 50706500, "step": 2367, "time_per_iteration": 2.5969314575195312 }, { "auxiliary_loss_clip": 0.01138095, "auxiliary_loss_mlp": 0.01039012, "balance_loss_clip": 1.05304301, "balance_loss_mlp": 1.02979755, "epoch": 0.2847351650333674, "flos": 28475976873600.0, "grad_norm": 2.8961190017883447, "language_loss": 0.82635599, "learning_rate": 3.3572153330730048e-06, "loss": 0.84812713, "num_input_tokens_seen": 50727595, "step": 2368, "time_per_iteration": 2.687941789627075 }, { "auxiliary_loss_clip": 0.01062609, "auxiliary_loss_mlp": 0.01004592, "balance_loss_clip": 1.02489674, "balance_loss_mlp": 1.00268495, "epoch": 0.2848554079240065, "flos": 55753399704960.0, "grad_norm": 0.832670656776317, "language_loss": 0.64721942, "learning_rate": 3.3566430747338956e-06, "loss": 0.66789144, "num_input_tokens_seen": 50782800, "step": 2369, "time_per_iteration": 3.0093696117401123 }, { "auxiliary_loss_clip": 0.01189633, "auxiliary_loss_mlp": 0.0103662, "balance_loss_clip": 1.05733109, "balance_loss_mlp": 1.02780485, "epoch": 0.2849756508146456, "flos": 11836875985920.0, "grad_norm": 1.9115927239543933, "language_loss": 0.86490762, "learning_rate": 3.35607061059311e-06, "loss": 0.8871702, "num_input_tokens_seen": 50797730, "step": 2370, "time_per_iteration": 2.4733998775482178 }, { "auxiliary_loss_clip": 0.01198364, "auxiliary_loss_mlp": 0.01029106, "balance_loss_clip": 1.05984664, "balance_loss_mlp": 1.01975393, "epoch": 0.28509589370528465, "flos": 25155209531520.0, "grad_norm": 1.7484409438440027, "language_loss": 0.74846464, "learning_rate": 3.3554979407374917e-06, "loss": 0.77073932, "num_input_tokens_seen": 50819840, "step": 2371, "time_per_iteration": 2.5399396419525146 }, { "auxiliary_loss_clip": 0.0118602, "auxiliary_loss_mlp": 0.01036131, "balance_loss_clip": 1.05640125, "balance_loss_mlp": 1.02691638, "epoch": 0.28521613659592376, "flos": 19974808287360.0, "grad_norm": 1.4832880198675957, "language_loss": 0.73315066, "learning_rate": 3.3549250652539134e-06, "loss": 0.75537217, "num_input_tokens_seen": 50838935, "step": 2372, "time_per_iteration": 2.510124921798706 }, { "auxiliary_loss_clip": 0.0117129, "auxiliary_loss_mlp": 0.01035296, "balance_loss_clip": 1.05431974, "balance_loss_mlp": 1.02597356, "epoch": 0.2853363794865629, "flos": 23367971491200.0, "grad_norm": 2.3378263483364305, "language_loss": 0.8161844, "learning_rate": 3.3543519842292794e-06, "loss": 0.83825022, "num_input_tokens_seen": 50858590, "step": 2373, "time_per_iteration": 2.5563879013061523 }, { "auxiliary_loss_clip": 0.01202957, "auxiliary_loss_mlp": 0.00762767, "balance_loss_clip": 1.06213975, "balance_loss_mlp": 1.00037646, "epoch": 0.28545662237720193, "flos": 19861940776320.0, "grad_norm": 1.938395407520442, "language_loss": 0.83594716, "learning_rate": 3.353778697750527e-06, "loss": 0.85560429, "num_input_tokens_seen": 50876995, "step": 2374, "time_per_iteration": 2.470747709274292 }, { "auxiliary_loss_clip": 0.01168328, "auxiliary_loss_mlp": 0.01035845, "balance_loss_clip": 1.05692816, "balance_loss_mlp": 1.02667761, "epoch": 0.28557686526784104, "flos": 23879016241920.0, "grad_norm": 1.547567901401587, "language_loss": 0.89322853, "learning_rate": 3.353205205904622e-06, "loss": 0.91527021, "num_input_tokens_seen": 50896105, "step": 2375, "time_per_iteration": 2.5622775554656982 }, { "auxiliary_loss_clip": 0.01171637, "auxiliary_loss_mlp": 0.01030874, "balance_loss_clip": 1.05634058, "balance_loss_mlp": 1.02152228, "epoch": 0.28569710815848015, "flos": 44890384233600.0, "grad_norm": 2.392648938616002, "language_loss": 0.71414751, "learning_rate": 3.3526315087785637e-06, "loss": 0.73617262, "num_input_tokens_seen": 50917220, "step": 2376, "time_per_iteration": 2.7424986362457275 }, { "auxiliary_loss_clip": 0.01126984, "auxiliary_loss_mlp": 0.01026086, "balance_loss_clip": 1.05321229, "balance_loss_mlp": 1.01690638, "epoch": 0.2858173510491192, "flos": 26829759628800.0, "grad_norm": 1.9162436880566267, "language_loss": 0.80860561, "learning_rate": 3.3520576064593805e-06, "loss": 0.8301363, "num_input_tokens_seen": 50937175, "step": 2377, "time_per_iteration": 3.4274961948394775 }, { "auxiliary_loss_clip": 0.01195215, "auxiliary_loss_mlp": 0.01038367, "balance_loss_clip": 1.06215811, "balance_loss_mlp": 1.02896094, "epoch": 0.2859375939397583, "flos": 23148916398720.0, "grad_norm": 1.5007515788121881, "language_loss": 0.81920648, "learning_rate": 3.3514834990341337e-06, "loss": 0.84154236, "num_input_tokens_seen": 50957500, "step": 2378, "time_per_iteration": 2.523139238357544 }, { "auxiliary_loss_clip": 0.01180616, "auxiliary_loss_mlp": 0.0103072, "balance_loss_clip": 1.05916286, "balance_loss_mlp": 1.02169561, "epoch": 0.2860578368303974, "flos": 12129799397760.0, "grad_norm": 2.3154240960621304, "language_loss": 0.92843497, "learning_rate": 3.3509091865899144e-06, "loss": 0.95054829, "num_input_tokens_seen": 50972690, "step": 2379, "time_per_iteration": 2.5333428382873535 }, { "auxiliary_loss_clip": 0.01201899, "auxiliary_loss_mlp": 0.01038295, "balance_loss_clip": 1.05971718, "balance_loss_mlp": 1.02817988, "epoch": 0.2861780797210365, "flos": 19938035738880.0, "grad_norm": 1.8821793016245674, "language_loss": 0.7055462, "learning_rate": 3.350334669213846e-06, "loss": 0.72794819, "num_input_tokens_seen": 50990095, "step": 2380, "time_per_iteration": 2.471461057662964 }, { "auxiliary_loss_clip": 0.01188812, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.06231141, "balance_loss_mlp": 1.02362692, "epoch": 0.2862983226116756, "flos": 27563127609600.0, "grad_norm": 2.165408972904066, "language_loss": 0.75717354, "learning_rate": 3.3497599469930816e-06, "loss": 0.77938724, "num_input_tokens_seen": 51008305, "step": 2381, "time_per_iteration": 2.532580852508545 }, { "auxiliary_loss_clip": 0.01203972, "auxiliary_loss_mlp": 0.01029673, "balance_loss_clip": 1.06198025, "balance_loss_mlp": 1.01948667, "epoch": 0.28641856550231465, "flos": 22053964158720.0, "grad_norm": 2.400974629201936, "language_loss": 0.82978415, "learning_rate": 3.349185020014807e-06, "loss": 0.85212064, "num_input_tokens_seen": 51025570, "step": 2382, "time_per_iteration": 2.466705799102783 }, { "auxiliary_loss_clip": 0.0119085, "auxiliary_loss_mlp": 0.01035732, "balance_loss_clip": 1.05953276, "balance_loss_mlp": 1.02580822, "epoch": 0.28653880839295376, "flos": 22378775869440.0, "grad_norm": 1.9465267917010782, "language_loss": 0.74525058, "learning_rate": 3.348609888366237e-06, "loss": 0.76751637, "num_input_tokens_seen": 51044585, "step": 2383, "time_per_iteration": 2.5319015979766846 }, { "auxiliary_loss_clip": 0.01124293, "auxiliary_loss_mlp": 0.01028558, "balance_loss_clip": 1.05058908, "balance_loss_mlp": 1.01952195, "epoch": 0.28665905128359287, "flos": 23367971491200.0, "grad_norm": 2.0865753703418295, "language_loss": 0.62901264, "learning_rate": 3.348034552134619e-06, "loss": 0.65054113, "num_input_tokens_seen": 51063990, "step": 2384, "time_per_iteration": 4.19303297996521 }, { "auxiliary_loss_clip": 0.01138827, "auxiliary_loss_mlp": 0.01036387, "balance_loss_clip": 1.0536809, "balance_loss_mlp": 1.02671933, "epoch": 0.2867792941742319, "flos": 20881695893760.0, "grad_norm": 3.3574807857846376, "language_loss": 0.84212196, "learning_rate": 3.3474590114072316e-06, "loss": 0.86387414, "num_input_tokens_seen": 51081990, "step": 2385, "time_per_iteration": 2.5792152881622314 }, { "auxiliary_loss_clip": 0.01157049, "auxiliary_loss_mlp": 0.010335, "balance_loss_clip": 1.05846095, "balance_loss_mlp": 1.02374887, "epoch": 0.28689953706487104, "flos": 20664005518080.0, "grad_norm": 1.9521252083613763, "language_loss": 0.82797921, "learning_rate": 3.3468832662713836e-06, "loss": 0.84988463, "num_input_tokens_seen": 51100235, "step": 2386, "time_per_iteration": 3.395561695098877 }, { "auxiliary_loss_clip": 0.0115961, "auxiliary_loss_mlp": 0.01035869, "balance_loss_clip": 1.05763149, "balance_loss_mlp": 1.02719617, "epoch": 0.28701977995551015, "flos": 12675533708160.0, "grad_norm": 2.4467970638605743, "language_loss": 0.8390286, "learning_rate": 3.346307316814415e-06, "loss": 0.86098337, "num_input_tokens_seen": 51115405, "step": 2387, "time_per_iteration": 2.5318331718444824 }, { "auxiliary_loss_clip": 0.01184453, "auxiliary_loss_mlp": 0.01034529, "balance_loss_clip": 1.05925167, "balance_loss_mlp": 1.02499795, "epoch": 0.2871400228461492, "flos": 21252366293760.0, "grad_norm": 1.981796763167291, "language_loss": 0.75437748, "learning_rate": 3.3457311631236965e-06, "loss": 0.77656734, "num_input_tokens_seen": 51136390, "step": 2388, "time_per_iteration": 2.519674301147461 }, { "auxiliary_loss_clip": 0.01159724, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 1.05405605, "balance_loss_mlp": 1.02671838, "epoch": 0.2872602657367883, "flos": 25119262995840.0, "grad_norm": 1.6585515317572712, "language_loss": 0.84537327, "learning_rate": 3.345154805286631e-06, "loss": 0.86733323, "num_input_tokens_seen": 51156650, "step": 2389, "time_per_iteration": 2.5627686977386475 }, { "auxiliary_loss_clip": 0.01181423, "auxiliary_loss_mlp": 0.01047823, "balance_loss_clip": 1.05673647, "balance_loss_mlp": 1.0378989, "epoch": 0.2873805086274274, "flos": 16646606830080.0, "grad_norm": 3.229318789910953, "language_loss": 0.7604515, "learning_rate": 3.344578243390651e-06, "loss": 0.78274405, "num_input_tokens_seen": 51172210, "step": 2390, "time_per_iteration": 2.4615721702575684 }, { "auxiliary_loss_clip": 0.01172161, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.05871987, "balance_loss_mlp": 1.02263975, "epoch": 0.2875007515180665, "flos": 17420123237760.0, "grad_norm": 2.3616347103555557, "language_loss": 0.78692114, "learning_rate": 3.3440014775232206e-06, "loss": 0.80896187, "num_input_tokens_seen": 51190265, "step": 2391, "time_per_iteration": 2.5250093936920166 }, { "auxiliary_loss_clip": 0.01162949, "auxiliary_loss_mlp": 0.01031933, "balance_loss_clip": 1.05780399, "balance_loss_mlp": 1.02329612, "epoch": 0.2876209944087056, "flos": 23434190213760.0, "grad_norm": 2.0477771162669205, "language_loss": 0.71003515, "learning_rate": 3.343424507771834e-06, "loss": 0.73198402, "num_input_tokens_seen": 51208475, "step": 2392, "time_per_iteration": 2.5756216049194336 }, { "auxiliary_loss_clip": 0.011604, "auxiliary_loss_mlp": 0.01035492, "balance_loss_clip": 1.05760288, "balance_loss_mlp": 1.02659321, "epoch": 0.2877412372993447, "flos": 13735509079680.0, "grad_norm": 1.727992258144669, "language_loss": 0.86510241, "learning_rate": 3.342847334224018e-06, "loss": 0.8870613, "num_input_tokens_seen": 51225875, "step": 2393, "time_per_iteration": 2.5388665199279785 }, { "auxiliary_loss_clip": 0.01083563, "auxiliary_loss_mlp": 0.01007444, "balance_loss_clip": 1.02665806, "balance_loss_mlp": 1.00554848, "epoch": 0.28786148018998375, "flos": 58079695104000.0, "grad_norm": 0.9435571300786746, "language_loss": 0.62343264, "learning_rate": 3.342269956967329e-06, "loss": 0.64434278, "num_input_tokens_seen": 51287780, "step": 2394, "time_per_iteration": 3.1629865169525146 }, { "auxiliary_loss_clip": 0.01188591, "auxiliary_loss_mlp": 0.01041098, "balance_loss_clip": 1.05799174, "balance_loss_mlp": 1.03106642, "epoch": 0.28798172308062286, "flos": 23435052140160.0, "grad_norm": 3.0317380246266143, "language_loss": 0.7150296, "learning_rate": 3.341692376089355e-06, "loss": 0.7373265, "num_input_tokens_seen": 51303335, "step": 2395, "time_per_iteration": 2.5881645679473877 }, { "auxiliary_loss_clip": 0.01184814, "auxiliary_loss_mlp": 0.01033841, "balance_loss_clip": 1.05934167, "balance_loss_mlp": 1.02463818, "epoch": 0.288101965971262, "flos": 25110033200640.0, "grad_norm": 4.542591623399206, "language_loss": 0.83471435, "learning_rate": 3.3411145916777146e-06, "loss": 0.85690093, "num_input_tokens_seen": 51317495, "step": 2396, "time_per_iteration": 2.528890609741211 }, { "auxiliary_loss_clip": 0.01165468, "auxiliary_loss_mlp": 0.0103917, "balance_loss_clip": 1.05615056, "balance_loss_mlp": 1.02934766, "epoch": 0.28822220886190103, "flos": 16252559654400.0, "grad_norm": 2.991114731114391, "language_loss": 0.90647137, "learning_rate": 3.3405366038200566e-06, "loss": 0.92851782, "num_input_tokens_seen": 51336430, "step": 2397, "time_per_iteration": 2.529747247695923 }, { "auxiliary_loss_clip": 0.01176874, "auxiliary_loss_mlp": 0.01036416, "balance_loss_clip": 1.06057322, "balance_loss_mlp": 1.02661109, "epoch": 0.28834245175254014, "flos": 24535642815360.0, "grad_norm": 2.3936783733360767, "language_loss": 0.85164142, "learning_rate": 3.3399584126040617e-06, "loss": 0.87377429, "num_input_tokens_seen": 51355930, "step": 2398, "time_per_iteration": 2.568298578262329 }, { "auxiliary_loss_clip": 0.01201693, "auxiliary_loss_mlp": 0.00762693, "balance_loss_clip": 1.06080699, "balance_loss_mlp": 1.00033689, "epoch": 0.2884626946431792, "flos": 24571445696640.0, "grad_norm": 3.489197519283409, "language_loss": 0.90500599, "learning_rate": 3.339380018117441e-06, "loss": 0.92464983, "num_input_tokens_seen": 51376765, "step": 2399, "time_per_iteration": 2.5237443447113037 }, { "auxiliary_loss_clip": 0.01187878, "auxiliary_loss_mlp": 0.01030123, "balance_loss_clip": 1.0638566, "balance_loss_mlp": 1.02148676, "epoch": 0.2885829375338183, "flos": 16544657053440.0, "grad_norm": 2.3679014595306516, "language_loss": 0.78726918, "learning_rate": 3.3388014204479366e-06, "loss": 0.8094492, "num_input_tokens_seen": 51394570, "step": 2400, "time_per_iteration": 2.4822442531585693 }, { "auxiliary_loss_clip": 0.0120252, "auxiliary_loss_mlp": 0.01037009, "balance_loss_clip": 1.06066263, "balance_loss_mlp": 1.02738905, "epoch": 0.2887031804244574, "flos": 24061226958720.0, "grad_norm": 1.96814232368608, "language_loss": 0.9164784, "learning_rate": 3.338222619683321e-06, "loss": 0.93887377, "num_input_tokens_seen": 51414535, "step": 2401, "time_per_iteration": 2.5004332065582275 }, { "auxiliary_loss_clip": 0.01174755, "auxiliary_loss_mlp": 0.01035832, "balance_loss_clip": 1.05746579, "balance_loss_mlp": 1.0267539, "epoch": 0.2888234233150965, "flos": 23330696152320.0, "grad_norm": 2.4464112861387064, "language_loss": 0.72994709, "learning_rate": 3.337643615911398e-06, "loss": 0.75205296, "num_input_tokens_seen": 51434160, "step": 2402, "time_per_iteration": 2.542466163635254 }, { "auxiliary_loss_clip": 0.0118521, "auxiliary_loss_mlp": 0.01039341, "balance_loss_clip": 1.0557189, "balance_loss_mlp": 1.02969742, "epoch": 0.2889436662057356, "flos": 22272767856000.0, "grad_norm": 1.9784324965254685, "language_loss": 0.78910029, "learning_rate": 3.3370644092200026e-06, "loss": 0.81134582, "num_input_tokens_seen": 51451435, "step": 2403, "time_per_iteration": 3.314467668533325 }, { "auxiliary_loss_clip": 0.01144274, "auxiliary_loss_mlp": 0.01033493, "balance_loss_clip": 1.0510242, "balance_loss_mlp": 1.02460027, "epoch": 0.2890639090963747, "flos": 21616931381760.0, "grad_norm": 4.122704915882568, "language_loss": 0.78268206, "learning_rate": 3.3364849996969985e-06, "loss": 0.80445969, "num_input_tokens_seen": 51471455, "step": 2404, "time_per_iteration": 2.568740129470825 }, { "auxiliary_loss_clip": 0.01184798, "auxiliary_loss_mlp": 0.01035061, "balance_loss_clip": 1.05916619, "balance_loss_mlp": 1.02557826, "epoch": 0.28918415198701375, "flos": 28585540333440.0, "grad_norm": 1.900027722799323, "language_loss": 0.85315907, "learning_rate": 3.335905387430283e-06, "loss": 0.87535769, "num_input_tokens_seen": 51492890, "step": 2405, "time_per_iteration": 2.572610855102539 }, { "auxiliary_loss_clip": 0.01175205, "auxiliary_loss_mlp": 0.01034917, "balance_loss_clip": 1.05535316, "balance_loss_mlp": 1.02660751, "epoch": 0.28930439487765286, "flos": 21944688007680.0, "grad_norm": 1.9446456717219711, "language_loss": 0.83019471, "learning_rate": 3.335325572507782e-06, "loss": 0.85229599, "num_input_tokens_seen": 51513390, "step": 2406, "time_per_iteration": 2.565513849258423 }, { "auxiliary_loss_clip": 0.01204574, "auxiliary_loss_mlp": 0.00762784, "balance_loss_clip": 1.06504011, "balance_loss_mlp": 1.00036597, "epoch": 0.28942463776829197, "flos": 19281911955840.0, "grad_norm": 1.703996587734889, "language_loss": 0.74078274, "learning_rate": 3.3347455550174537e-06, "loss": 0.76045632, "num_input_tokens_seen": 51532730, "step": 2407, "time_per_iteration": 2.49052357673645 }, { "auxiliary_loss_clip": 0.01152303, "auxiliary_loss_mlp": 0.01031247, "balance_loss_clip": 1.05135429, "balance_loss_mlp": 1.02257395, "epoch": 0.289544880658931, "flos": 14645700737280.0, "grad_norm": 1.6652870265105486, "language_loss": 0.67954171, "learning_rate": 3.3341653350472864e-06, "loss": 0.70137727, "num_input_tokens_seen": 51549560, "step": 2408, "time_per_iteration": 2.517117738723755 }, { "auxiliary_loss_clip": 0.0120579, "auxiliary_loss_mlp": 0.01034307, "balance_loss_clip": 1.0603354, "balance_loss_mlp": 1.02379906, "epoch": 0.28966512354957014, "flos": 28621881918720.0, "grad_norm": 2.4811104187625452, "language_loss": 0.6946069, "learning_rate": 3.333584912685298e-06, "loss": 0.71700788, "num_input_tokens_seen": 51568180, "step": 2409, "time_per_iteration": 2.5228824615478516 }, { "auxiliary_loss_clip": 0.01059413, "auxiliary_loss_mlp": 0.01006232, "balance_loss_clip": 1.02534842, "balance_loss_mlp": 1.0043366, "epoch": 0.28978536644020925, "flos": 64711784511360.0, "grad_norm": 0.8970236442131861, "language_loss": 0.55562925, "learning_rate": 3.3330042880195385e-06, "loss": 0.57628572, "num_input_tokens_seen": 51622530, "step": 2410, "time_per_iteration": 3.801926612854004 }, { "auxiliary_loss_clip": 0.01170732, "auxiliary_loss_mlp": 0.01028572, "balance_loss_clip": 1.05524683, "balance_loss_mlp": 1.01957738, "epoch": 0.2899056093308483, "flos": 18624638937600.0, "grad_norm": 2.131850535188781, "language_loss": 0.78816617, "learning_rate": 3.3324234611380888e-06, "loss": 0.81015921, "num_input_tokens_seen": 51641260, "step": 2411, "time_per_iteration": 3.307101249694824 }, { "auxiliary_loss_clip": 0.01149658, "auxiliary_loss_mlp": 0.01036056, "balance_loss_clip": 1.05336714, "balance_loss_mlp": 1.02729964, "epoch": 0.2900258522214874, "flos": 22893735202560.0, "grad_norm": 2.312450210385715, "language_loss": 0.81642991, "learning_rate": 3.3318424321290596e-06, "loss": 0.838287, "num_input_tokens_seen": 51660975, "step": 2412, "time_per_iteration": 3.31197190284729 }, { "auxiliary_loss_clip": 0.01050656, "auxiliary_loss_mlp": 0.01000991, "balance_loss_clip": 1.01744866, "balance_loss_mlp": 0.99914283, "epoch": 0.2901460951121265, "flos": 71106036013440.0, "grad_norm": 0.8273849352883264, "language_loss": 0.59949553, "learning_rate": 3.3312612010805917e-06, "loss": 0.62001193, "num_input_tokens_seen": 51720550, "step": 2413, "time_per_iteration": 3.201558828353882 }, { "auxiliary_loss_clip": 0.01158828, "auxiliary_loss_mlp": 0.01038101, "balance_loss_clip": 1.05455792, "balance_loss_mlp": 1.02876127, "epoch": 0.2902663380027656, "flos": 32160986081280.0, "grad_norm": 1.7020718499068668, "language_loss": 0.7019453, "learning_rate": 3.330679768080858e-06, "loss": 0.72391456, "num_input_tokens_seen": 51744435, "step": 2414, "time_per_iteration": 2.600551128387451 }, { "auxiliary_loss_clip": 0.01185823, "auxiliary_loss_mlp": 0.01042939, "balance_loss_clip": 1.06016445, "balance_loss_mlp": 1.03324091, "epoch": 0.2903865808934047, "flos": 29351658539520.0, "grad_norm": 2.3670046855639844, "language_loss": 0.83202106, "learning_rate": 3.3300981332180627e-06, "loss": 0.85430872, "num_input_tokens_seen": 51763640, "step": 2415, "time_per_iteration": 2.5840582847595215 }, { "auxiliary_loss_clip": 0.01161934, "auxiliary_loss_mlp": 0.01036163, "balance_loss_clip": 1.05605626, "balance_loss_mlp": 1.02699614, "epoch": 0.29050682378404374, "flos": 17089026647040.0, "grad_norm": 1.9505864914524436, "language_loss": 0.79842639, "learning_rate": 3.3295162965804373e-06, "loss": 0.82040739, "num_input_tokens_seen": 51782135, "step": 2416, "time_per_iteration": 2.5302350521087646 }, { "auxiliary_loss_clip": 0.0115445, "auxiliary_loss_mlp": 0.01034533, "balance_loss_clip": 1.05459321, "balance_loss_mlp": 1.02594972, "epoch": 0.29062706667468285, "flos": 17858233422720.0, "grad_norm": 2.2295140351066265, "language_loss": 0.78410506, "learning_rate": 3.328934258256247e-06, "loss": 0.80599493, "num_input_tokens_seen": 51800200, "step": 2417, "time_per_iteration": 2.5521957874298096 }, { "auxiliary_loss_clip": 0.01182792, "auxiliary_loss_mlp": 0.01035066, "balance_loss_clip": 1.05612218, "balance_loss_mlp": 1.02612495, "epoch": 0.29074730956532197, "flos": 24279815174400.0, "grad_norm": 3.079278819489508, "language_loss": 0.67013514, "learning_rate": 3.3283520183337856e-06, "loss": 0.69231373, "num_input_tokens_seen": 51819905, "step": 2418, "time_per_iteration": 2.5344526767730713 }, { "auxiliary_loss_clip": 0.01168346, "auxiliary_loss_mlp": 0.01031048, "balance_loss_clip": 1.05508685, "balance_loss_mlp": 1.02204204, "epoch": 0.290867552455961, "flos": 22340961826560.0, "grad_norm": 1.7730347837345348, "language_loss": 0.68783128, "learning_rate": 3.3277695769013797e-06, "loss": 0.70982516, "num_input_tokens_seen": 51839350, "step": 2419, "time_per_iteration": 2.574254274368286 }, { "auxiliary_loss_clip": 0.01186153, "auxiliary_loss_mlp": 0.01036302, "balance_loss_clip": 1.05865359, "balance_loss_mlp": 1.02687287, "epoch": 0.29098779534660013, "flos": 23186155824000.0, "grad_norm": 1.9148056965129299, "language_loss": 0.77428234, "learning_rate": 3.327186934047385e-06, "loss": 0.79650688, "num_input_tokens_seen": 51858045, "step": 2420, "time_per_iteration": 2.5046346187591553 }, { "auxiliary_loss_clip": 0.01158492, "auxiliary_loss_mlp": 0.01038072, "balance_loss_clip": 1.05109048, "balance_loss_mlp": 1.02910709, "epoch": 0.29110803823723924, "flos": 15304194817920.0, "grad_norm": 3.04833949111504, "language_loss": 0.655424, "learning_rate": 3.3266040898601877e-06, "loss": 0.67738962, "num_input_tokens_seen": 51875880, "step": 2421, "time_per_iteration": 2.5122509002685547 }, { "auxiliary_loss_clip": 0.0113248, "auxiliary_loss_mlp": 0.01035251, "balance_loss_clip": 1.04723775, "balance_loss_mlp": 1.0270257, "epoch": 0.2912282811278783, "flos": 22595352923520.0, "grad_norm": 1.8914826358752002, "language_loss": 0.77827018, "learning_rate": 3.3260210444282045e-06, "loss": 0.7999475, "num_input_tokens_seen": 51893835, "step": 2422, "time_per_iteration": 2.6015543937683105 }, { "auxiliary_loss_clip": 0.01177379, "auxiliary_loss_mlp": 0.01030086, "balance_loss_clip": 1.05566704, "balance_loss_mlp": 1.02141345, "epoch": 0.2913485240185174, "flos": 24497900599680.0, "grad_norm": 2.1318419610532513, "language_loss": 0.73121077, "learning_rate": 3.325437797839883e-06, "loss": 0.75328541, "num_input_tokens_seen": 51912205, "step": 2423, "time_per_iteration": 2.529358148574829 }, { "auxiliary_loss_clip": 0.01200246, "auxiliary_loss_mlp": 0.0103535, "balance_loss_clip": 1.05909157, "balance_loss_mlp": 1.02655888, "epoch": 0.2914687669091565, "flos": 17931024334080.0, "grad_norm": 2.418907675710475, "language_loss": 0.75137031, "learning_rate": 3.3248543501837015e-06, "loss": 0.77372622, "num_input_tokens_seen": 51929410, "step": 2424, "time_per_iteration": 2.52130126953125 }, { "auxiliary_loss_clip": 0.01151372, "auxiliary_loss_mlp": 0.01034171, "balance_loss_clip": 1.05811715, "balance_loss_mlp": 1.02617812, "epoch": 0.2915890097997956, "flos": 22529313768960.0, "grad_norm": 1.8058898493029816, "language_loss": 0.77121162, "learning_rate": 3.3242707015481684e-06, "loss": 0.7930671, "num_input_tokens_seen": 51949345, "step": 2425, "time_per_iteration": 2.6164917945861816 }, { "auxiliary_loss_clip": 0.01166409, "auxiliary_loss_mlp": 0.0103099, "balance_loss_clip": 1.04963052, "balance_loss_mlp": 1.02228224, "epoch": 0.2917092526904347, "flos": 13845216193920.0, "grad_norm": 1.635185620823894, "language_loss": 0.80908477, "learning_rate": 3.323686852021823e-06, "loss": 0.83105874, "num_input_tokens_seen": 51966855, "step": 2426, "time_per_iteration": 2.5039713382720947 }, { "auxiliary_loss_clip": 0.01159832, "auxiliary_loss_mlp": 0.01037532, "balance_loss_clip": 1.05212998, "balance_loss_mlp": 1.02793002, "epoch": 0.2918294955810738, "flos": 22674859678080.0, "grad_norm": 1.9354214409264983, "language_loss": 0.79396296, "learning_rate": 3.323102801693235e-06, "loss": 0.81593657, "num_input_tokens_seen": 51985620, "step": 2427, "time_per_iteration": 2.5924267768859863 }, { "auxiliary_loss_clip": 0.01176676, "auxiliary_loss_mlp": 0.01032829, "balance_loss_clip": 1.05561376, "balance_loss_mlp": 1.0235368, "epoch": 0.29194973847171285, "flos": 23438284364160.0, "grad_norm": 2.2368522512028135, "language_loss": 0.80944872, "learning_rate": 3.322518550651003e-06, "loss": 0.83154368, "num_input_tokens_seen": 52004930, "step": 2428, "time_per_iteration": 2.535911798477173 }, { "auxiliary_loss_clip": 0.01174391, "auxiliary_loss_mlp": 0.0103453, "balance_loss_clip": 1.053864, "balance_loss_mlp": 1.02587557, "epoch": 0.29206998136235196, "flos": 21909064694400.0, "grad_norm": 1.6596149562710818, "language_loss": 0.81304467, "learning_rate": 3.3219340989837586e-06, "loss": 0.83513391, "num_input_tokens_seen": 52024920, "step": 2429, "time_per_iteration": 3.3419485092163086 }, { "auxiliary_loss_clip": 0.01169841, "auxiliary_loss_mlp": 0.01034004, "balance_loss_clip": 1.05479693, "balance_loss_mlp": 1.02622008, "epoch": 0.292190224252991, "flos": 23215925220480.0, "grad_norm": 1.698795395695219, "language_loss": 0.80147791, "learning_rate": 3.3213494467801625e-06, "loss": 0.82351631, "num_input_tokens_seen": 52044095, "step": 2430, "time_per_iteration": 2.5549981594085693 }, { "auxiliary_loss_clip": 0.01111268, "auxiliary_loss_mlp": 0.01033147, "balance_loss_clip": 1.04521585, "balance_loss_mlp": 1.02389598, "epoch": 0.2923104671436301, "flos": 20740818752640.0, "grad_norm": 1.992233760305085, "language_loss": 0.71436083, "learning_rate": 3.3207645941289063e-06, "loss": 0.73580498, "num_input_tokens_seen": 52062440, "step": 2431, "time_per_iteration": 2.715721368789673 }, { "auxiliary_loss_clip": 0.01182978, "auxiliary_loss_mlp": 0.00762449, "balance_loss_clip": 1.05692124, "balance_loss_mlp": 1.0003469, "epoch": 0.29243071003426924, "flos": 35809114999680.0, "grad_norm": 1.7835221353485522, "language_loss": 0.80036533, "learning_rate": 3.320179541118711e-06, "loss": 0.81981957, "num_input_tokens_seen": 52084940, "step": 2432, "time_per_iteration": 2.715223550796509 }, { "auxiliary_loss_clip": 0.01074643, "auxiliary_loss_mlp": 0.01003369, "balance_loss_clip": 1.01757157, "balance_loss_mlp": 1.00152159, "epoch": 0.2925509529249083, "flos": 58081598524800.0, "grad_norm": 1.00345004022608, "language_loss": 0.60321307, "learning_rate": 3.3195942878383293e-06, "loss": 0.62399316, "num_input_tokens_seen": 52141040, "step": 2433, "time_per_iteration": 3.1321558952331543 }, { "auxiliary_loss_clip": 0.01186527, "auxiliary_loss_mlp": 0.01037681, "balance_loss_clip": 1.06054354, "balance_loss_mlp": 1.02857351, "epoch": 0.2926711958155474, "flos": 21397122103680.0, "grad_norm": 3.814591903988737, "language_loss": 0.777349, "learning_rate": 3.319008834376543e-06, "loss": 0.79959106, "num_input_tokens_seen": 52160730, "step": 2434, "time_per_iteration": 2.5396580696105957 }, { "auxiliary_loss_clip": 0.01158888, "auxiliary_loss_mlp": 0.01028754, "balance_loss_clip": 1.04928637, "balance_loss_mlp": 1.01960456, "epoch": 0.2927914387061865, "flos": 23185796688000.0, "grad_norm": 2.6140770324049103, "language_loss": 0.8864547, "learning_rate": 3.3184231808221654e-06, "loss": 0.90833116, "num_input_tokens_seen": 52175055, "step": 2435, "time_per_iteration": 2.5725255012512207 }, { "auxiliary_loss_clip": 0.01156316, "auxiliary_loss_mlp": 0.0103427, "balance_loss_clip": 1.05384183, "balance_loss_mlp": 1.02479327, "epoch": 0.29291168159682557, "flos": 22455553190400.0, "grad_norm": 2.278607087779395, "language_loss": 0.62422669, "learning_rate": 3.3178373272640394e-06, "loss": 0.64613259, "num_input_tokens_seen": 52194150, "step": 2436, "time_per_iteration": 4.107103109359741 }, { "auxiliary_loss_clip": 0.01197018, "auxiliary_loss_mlp": 0.01030435, "balance_loss_clip": 1.05929971, "balance_loss_mlp": 1.02194166, "epoch": 0.2930319244874647, "flos": 21170632896000.0, "grad_norm": 2.402536829158671, "language_loss": 0.84483981, "learning_rate": 3.3172512737910387e-06, "loss": 0.86711442, "num_input_tokens_seen": 52211660, "step": 2437, "time_per_iteration": 2.4915659427642822 }, { "auxiliary_loss_clip": 0.01183612, "auxiliary_loss_mlp": 0.01037392, "balance_loss_clip": 1.05524182, "balance_loss_mlp": 1.02842748, "epoch": 0.2931521673781038, "flos": 31357843931520.0, "grad_norm": 1.9839286344219806, "language_loss": 0.87983799, "learning_rate": 3.3166650204920674e-06, "loss": 0.90204799, "num_input_tokens_seen": 52232830, "step": 2438, "time_per_iteration": 3.404224157333374 }, { "auxiliary_loss_clip": 0.01185194, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.05863845, "balance_loss_mlp": 1.02194989, "epoch": 0.29327241026874284, "flos": 24200990778240.0, "grad_norm": 1.699245774308353, "language_loss": 0.81571299, "learning_rate": 3.316078567456059e-06, "loss": 0.83787799, "num_input_tokens_seen": 52250670, "step": 2439, "time_per_iteration": 2.5440163612365723 }, { "auxiliary_loss_clip": 0.01132734, "auxiliary_loss_mlp": 0.01040677, "balance_loss_clip": 1.05131006, "balance_loss_mlp": 1.03196275, "epoch": 0.29339265315938196, "flos": 24242611662720.0, "grad_norm": 2.1008481464579516, "language_loss": 0.75592458, "learning_rate": 3.3154919147719786e-06, "loss": 0.7776587, "num_input_tokens_seen": 52271685, "step": 2440, "time_per_iteration": 2.6706416606903076 }, { "auxiliary_loss_clip": 0.01183594, "auxiliary_loss_mlp": 0.01032588, "balance_loss_clip": 1.05680084, "balance_loss_mlp": 1.02371836, "epoch": 0.29351289605002107, "flos": 16946641134720.0, "grad_norm": 2.503933893656644, "language_loss": 0.85954106, "learning_rate": 3.31490506252882e-06, "loss": 0.88170284, "num_input_tokens_seen": 52291065, "step": 2441, "time_per_iteration": 2.5020763874053955 }, { "auxiliary_loss_clip": 0.01146298, "auxiliary_loss_mlp": 0.01038992, "balance_loss_clip": 1.04910517, "balance_loss_mlp": 1.03021216, "epoch": 0.2936331389406601, "flos": 19829082810240.0, "grad_norm": 1.92249498911603, "language_loss": 0.83540368, "learning_rate": 3.31431801081561e-06, "loss": 0.85725653, "num_input_tokens_seen": 52310000, "step": 2442, "time_per_iteration": 2.5887837409973145 }, { "auxiliary_loss_clip": 0.01075814, "auxiliary_loss_mlp": 0.01002401, "balance_loss_clip": 1.03314924, "balance_loss_mlp": 1.00074387, "epoch": 0.29375338183129923, "flos": 71416844398080.0, "grad_norm": 0.9082784831042658, "language_loss": 0.67917764, "learning_rate": 3.313730759721402e-06, "loss": 0.69995981, "num_input_tokens_seen": 52372930, "step": 2443, "time_per_iteration": 3.2620534896850586 }, { "auxiliary_loss_clip": 0.01167475, "auxiliary_loss_mlp": 0.01040118, "balance_loss_clip": 1.05678678, "balance_loss_mlp": 1.03167796, "epoch": 0.29387362472193834, "flos": 22054502862720.0, "grad_norm": 1.8678653541792958, "language_loss": 0.86396754, "learning_rate": 3.313143309335282e-06, "loss": 0.88604349, "num_input_tokens_seen": 52391420, "step": 2444, "time_per_iteration": 2.530712604522705 }, { "auxiliary_loss_clip": 0.01152829, "auxiliary_loss_mlp": 0.01030546, "balance_loss_clip": 1.05355835, "balance_loss_mlp": 1.02137852, "epoch": 0.2939938676125774, "flos": 22966418373120.0, "grad_norm": 1.8895841324958564, "language_loss": 0.84930515, "learning_rate": 3.3125556597463665e-06, "loss": 0.87113881, "num_input_tokens_seen": 52410725, "step": 2445, "time_per_iteration": 2.5981037616729736 }, { "auxiliary_loss_clip": 0.01185379, "auxiliary_loss_mlp": 0.01039106, "balance_loss_clip": 1.05999565, "balance_loss_mlp": 1.03084469, "epoch": 0.2941141105032165, "flos": 31358705857920.0, "grad_norm": 2.155739762763418, "language_loss": 0.66615808, "learning_rate": 3.311967811043801e-06, "loss": 0.68840295, "num_input_tokens_seen": 52432645, "step": 2446, "time_per_iteration": 2.58964467048645 }, { "auxiliary_loss_clip": 0.01184652, "auxiliary_loss_mlp": 0.01037896, "balance_loss_clip": 1.0597589, "balance_loss_mlp": 1.02924728, "epoch": 0.29423435339385556, "flos": 23222138273280.0, "grad_norm": 2.0215629468298064, "language_loss": 0.82002008, "learning_rate": 3.3113797633167617e-06, "loss": 0.84224552, "num_input_tokens_seen": 52450940, "step": 2447, "time_per_iteration": 2.5618627071380615 }, { "auxiliary_loss_clip": 0.01198472, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.05867612, "balance_loss_mlp": 1.02313459, "epoch": 0.2943545962844947, "flos": 26864054138880.0, "grad_norm": 2.0338429723827662, "language_loss": 0.68589652, "learning_rate": 3.310791516654455e-06, "loss": 0.70819944, "num_input_tokens_seen": 52468000, "step": 2448, "time_per_iteration": 2.510129928588867 }, { "auxiliary_loss_clip": 0.01158785, "auxiliary_loss_mlp": 0.0103798, "balance_loss_clip": 1.05226517, "balance_loss_mlp": 1.02893829, "epoch": 0.2944748391751338, "flos": 20231677422720.0, "grad_norm": 1.9200950876062715, "language_loss": 0.79376876, "learning_rate": 3.3102030711461177e-06, "loss": 0.81573641, "num_input_tokens_seen": 52487575, "step": 2449, "time_per_iteration": 2.5894017219543457 }, { "auxiliary_loss_clip": 0.01157068, "auxiliary_loss_mlp": 0.01039375, "balance_loss_clip": 1.05295348, "balance_loss_mlp": 1.02993369, "epoch": 0.29459508206577284, "flos": 15960965045760.0, "grad_norm": 2.122811860071899, "language_loss": 0.67900801, "learning_rate": 3.3096144268810156e-06, "loss": 0.70097244, "num_input_tokens_seen": 52506335, "step": 2450, "time_per_iteration": 2.561272382736206 }, { "auxiliary_loss_clip": 0.01171769, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.05361807, "balance_loss_mlp": 1.02752209, "epoch": 0.29471532495641195, "flos": 20412882558720.0, "grad_norm": 1.8571043711220467, "language_loss": 0.72882128, "learning_rate": 3.3090255839484462e-06, "loss": 0.75090605, "num_input_tokens_seen": 52524330, "step": 2451, "time_per_iteration": 2.5463149547576904 }, { "auxiliary_loss_clip": 0.01167173, "auxiliary_loss_mlp": 0.01032145, "balance_loss_clip": 1.05220604, "balance_loss_mlp": 1.02353811, "epoch": 0.29483556784705106, "flos": 20376576887040.0, "grad_norm": 1.8691923021265149, "language_loss": 0.85358196, "learning_rate": 3.3084365424377366e-06, "loss": 0.87557518, "num_input_tokens_seen": 52543095, "step": 2452, "time_per_iteration": 2.5425808429718018 }, { "auxiliary_loss_clip": 0.01048809, "auxiliary_loss_mlp": 0.01002979, "balance_loss_clip": 1.02432251, "balance_loss_mlp": 1.00144124, "epoch": 0.2949558107376901, "flos": 68555660595840.0, "grad_norm": 1.2123634580202236, "language_loss": 0.56001246, "learning_rate": 3.307847302438245e-06, "loss": 0.58053029, "num_input_tokens_seen": 52597075, "step": 2453, "time_per_iteration": 3.0884931087493896 }, { "auxiliary_loss_clip": 0.01122085, "auxiliary_loss_mlp": 0.01037675, "balance_loss_clip": 1.04489481, "balance_loss_mlp": 1.02823925, "epoch": 0.2950760536283292, "flos": 16107085572480.0, "grad_norm": 2.088588009301697, "language_loss": 0.77563035, "learning_rate": 3.3072578640393562e-06, "loss": 0.79722792, "num_input_tokens_seen": 52614410, "step": 2454, "time_per_iteration": 2.594249725341797 }, { "auxiliary_loss_clip": 0.01170135, "auxiliary_loss_mlp": 0.01031695, "balance_loss_clip": 1.05491805, "balance_loss_mlp": 1.02315974, "epoch": 0.29519629651896834, "flos": 20483626394880.0, "grad_norm": 2.1030331415416788, "language_loss": 0.79631305, "learning_rate": 3.3066682273304886e-06, "loss": 0.81833136, "num_input_tokens_seen": 52632055, "step": 2455, "time_per_iteration": 3.252758026123047 }, { "auxiliary_loss_clip": 0.01187842, "auxiliary_loss_mlp": 0.007628, "balance_loss_clip": 1.05767989, "balance_loss_mlp": 1.00032735, "epoch": 0.2953165394096074, "flos": 18916484941440.0, "grad_norm": 1.8979001851550605, "language_loss": 0.78561115, "learning_rate": 3.3060783924010904e-06, "loss": 0.80511755, "num_input_tokens_seen": 52649980, "step": 2456, "time_per_iteration": 2.5073962211608887 }, { "auxiliary_loss_clip": 0.0115496, "auxiliary_loss_mlp": 0.01032481, "balance_loss_clip": 1.05314028, "balance_loss_mlp": 1.02370071, "epoch": 0.2954367823002465, "flos": 20624467622400.0, "grad_norm": 2.0805814026230975, "language_loss": 0.85043514, "learning_rate": 3.3054883593406387e-06, "loss": 0.87230957, "num_input_tokens_seen": 52664730, "step": 2457, "time_per_iteration": 2.5773870944976807 }, { "auxiliary_loss_clip": 0.01173291, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.05480123, "balance_loss_mlp": 1.02513981, "epoch": 0.2955570251908856, "flos": 31175525473920.0, "grad_norm": 2.2764640571841888, "language_loss": 0.65365601, "learning_rate": 3.3048981282386404e-06, "loss": 0.67572874, "num_input_tokens_seen": 52686040, "step": 2458, "time_per_iteration": 2.6222801208496094 }, { "auxiliary_loss_clip": 0.01141152, "auxiliary_loss_mlp": 0.0103434, "balance_loss_clip": 1.05137539, "balance_loss_mlp": 1.0250771, "epoch": 0.29567726808152467, "flos": 21650328051840.0, "grad_norm": 3.719711829924088, "language_loss": 0.82628119, "learning_rate": 3.304307699184634e-06, "loss": 0.84803605, "num_input_tokens_seen": 52704630, "step": 2459, "time_per_iteration": 2.5909266471862793 }, { "auxiliary_loss_clip": 0.01171326, "auxiliary_loss_mlp": 0.01038541, "balance_loss_clip": 1.05718517, "balance_loss_mlp": 1.02958894, "epoch": 0.2957975109721638, "flos": 24243868638720.0, "grad_norm": 1.7349819985106545, "language_loss": 0.7889744, "learning_rate": 3.3037170722681866e-06, "loss": 0.81107306, "num_input_tokens_seen": 52725465, "step": 2460, "time_per_iteration": 2.590567111968994 }, { "auxiliary_loss_clip": 0.01148109, "auxiliary_loss_mlp": 0.01032396, "balance_loss_clip": 1.05330038, "balance_loss_mlp": 1.02321124, "epoch": 0.29591775386280283, "flos": 13479717352320.0, "grad_norm": 1.77757326079078, "language_loss": 0.67758799, "learning_rate": 3.3031262475788956e-06, "loss": 0.69939303, "num_input_tokens_seen": 52742405, "step": 2461, "time_per_iteration": 2.543487548828125 }, { "auxiliary_loss_clip": 0.01166042, "auxiliary_loss_mlp": 0.01025207, "balance_loss_clip": 1.05477631, "balance_loss_mlp": 1.01691031, "epoch": 0.29603799675344195, "flos": 17749783284480.0, "grad_norm": 1.634353352880499, "language_loss": 0.72798067, "learning_rate": 3.3025352252063897e-06, "loss": 0.74989319, "num_input_tokens_seen": 52761100, "step": 2462, "time_per_iteration": 3.3852343559265137 }, { "auxiliary_loss_clip": 0.01185023, "auxiliary_loss_mlp": 0.01029853, "balance_loss_clip": 1.05919933, "balance_loss_mlp": 1.02192569, "epoch": 0.29615823964408106, "flos": 22783920347520.0, "grad_norm": 1.9256520172860376, "language_loss": 0.74570453, "learning_rate": 3.3019440052403252e-06, "loss": 0.76785326, "num_input_tokens_seen": 52780965, "step": 2463, "time_per_iteration": 2.545198440551758 }, { "auxiliary_loss_clip": 0.0117092, "auxiliary_loss_mlp": 0.01033352, "balance_loss_clip": 1.05634141, "balance_loss_mlp": 1.02460766, "epoch": 0.2962784825347201, "flos": 23514199758720.0, "grad_norm": 2.0967880326168995, "language_loss": 0.70839697, "learning_rate": 3.30135258777039e-06, "loss": 0.73043966, "num_input_tokens_seen": 52800335, "step": 2464, "time_per_iteration": 3.280278444290161 }, { "auxiliary_loss_clip": 0.01186439, "auxiliary_loss_mlp": 0.00762591, "balance_loss_clip": 1.05541146, "balance_loss_mlp": 1.00031757, "epoch": 0.2963987254253592, "flos": 16362769559040.0, "grad_norm": 1.9750850652919807, "language_loss": 0.70132291, "learning_rate": 3.3007609728863024e-06, "loss": 0.72081327, "num_input_tokens_seen": 52818425, "step": 2465, "time_per_iteration": 2.514265775680542 }, { "auxiliary_loss_clip": 0.01123416, "auxiliary_loss_mlp": 0.01031433, "balance_loss_clip": 1.05566669, "balance_loss_mlp": 1.02278435, "epoch": 0.29651896831599833, "flos": 33472263980160.0, "grad_norm": 1.7932924768866525, "language_loss": 0.73105145, "learning_rate": 3.300169160677809e-06, "loss": 0.7525999, "num_input_tokens_seen": 52842340, "step": 2466, "time_per_iteration": 2.7565760612487793 }, { "auxiliary_loss_clip": 0.01165384, "auxiliary_loss_mlp": 0.01033375, "balance_loss_clip": 1.05672431, "balance_loss_mlp": 1.02387428, "epoch": 0.2966392112066374, "flos": 23805363404160.0, "grad_norm": 2.6386223810522362, "language_loss": 0.77660775, "learning_rate": 3.2995771512346878e-06, "loss": 0.79859537, "num_input_tokens_seen": 52860690, "step": 2467, "time_per_iteration": 2.5952529907226562 }, { "auxiliary_loss_clip": 0.01201767, "auxiliary_loss_mlp": 0.00762383, "balance_loss_clip": 1.06075978, "balance_loss_mlp": 1.0003413, "epoch": 0.2967594540972765, "flos": 19938466702080.0, "grad_norm": 2.3795122361842105, "language_loss": 0.73470116, "learning_rate": 3.298984944646746e-06, "loss": 0.75434268, "num_input_tokens_seen": 52879370, "step": 2468, "time_per_iteration": 2.4656336307525635 }, { "auxiliary_loss_clip": 0.01187394, "auxiliary_loss_mlp": 0.00761826, "balance_loss_clip": 1.0599314, "balance_loss_mlp": 1.00036609, "epoch": 0.2968796969879156, "flos": 23732823888000.0, "grad_norm": 1.859371354096997, "language_loss": 0.81915569, "learning_rate": 3.298392541003822e-06, "loss": 0.83864784, "num_input_tokens_seen": 52898775, "step": 2469, "time_per_iteration": 2.562732696533203 }, { "auxiliary_loss_clip": 0.01165706, "auxiliary_loss_mlp": 0.01034319, "balance_loss_clip": 1.05425835, "balance_loss_mlp": 1.02629662, "epoch": 0.29699993987855466, "flos": 22893699288960.0, "grad_norm": 1.7238899784871047, "language_loss": 0.89789903, "learning_rate": 3.2977999403957806e-06, "loss": 0.91989928, "num_input_tokens_seen": 52917535, "step": 2470, "time_per_iteration": 2.5340116024017334 }, { "auxiliary_loss_clip": 0.01200573, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.06088161, "balance_loss_mlp": 1.02548122, "epoch": 0.2971201827691938, "flos": 33832555349760.0, "grad_norm": 2.017282103090687, "language_loss": 0.67053419, "learning_rate": 3.2972071429125207e-06, "loss": 0.69288474, "num_input_tokens_seen": 52938755, "step": 2471, "time_per_iteration": 2.610945463180542 }, { "auxiliary_loss_clip": 0.01154294, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.05513227, "balance_loss_mlp": 1.02459741, "epoch": 0.2972404256598329, "flos": 22054359208320.0, "grad_norm": 2.0443086939334796, "language_loss": 0.88290483, "learning_rate": 3.2966141486439682e-06, "loss": 0.90478641, "num_input_tokens_seen": 52957945, "step": 2472, "time_per_iteration": 2.5556628704071045 }, { "auxiliary_loss_clip": 0.0112616, "auxiliary_loss_mlp": 0.01035418, "balance_loss_clip": 1.04613543, "balance_loss_mlp": 1.02529693, "epoch": 0.29736066855047194, "flos": 31978595796480.0, "grad_norm": 2.485067868808452, "language_loss": 0.63920689, "learning_rate": 3.29602095768008e-06, "loss": 0.66082269, "num_input_tokens_seen": 52978460, "step": 2473, "time_per_iteration": 2.7373788356781006 }, { "auxiliary_loss_clip": 0.01166087, "auxiliary_loss_mlp": 0.01032143, "balance_loss_clip": 1.05825901, "balance_loss_mlp": 1.02371454, "epoch": 0.29748091144111105, "flos": 33510401245440.0, "grad_norm": 1.8789819343648704, "language_loss": 0.63824511, "learning_rate": 3.2954275701108437e-06, "loss": 0.66022742, "num_input_tokens_seen": 52999640, "step": 2474, "time_per_iteration": 2.6426918506622314 }, { "auxiliary_loss_clip": 0.01134998, "auxiliary_loss_mlp": 0.01030363, "balance_loss_clip": 1.05010724, "balance_loss_mlp": 1.02127886, "epoch": 0.29760115433175016, "flos": 41283373409280.0, "grad_norm": 1.7477815406723394, "language_loss": 0.684232, "learning_rate": 3.294833986026275e-06, "loss": 0.70588565, "num_input_tokens_seen": 53022880, "step": 2475, "time_per_iteration": 2.7863945960998535 }, { "auxiliary_loss_clip": 0.01149097, "auxiliary_loss_mlp": 0.01036658, "balance_loss_clip": 1.05248344, "balance_loss_mlp": 1.02708507, "epoch": 0.2977213972223892, "flos": 24493339572480.0, "grad_norm": 2.028121219475763, "language_loss": 0.85000437, "learning_rate": 3.29424020551642e-06, "loss": 0.87186193, "num_input_tokens_seen": 53041515, "step": 2476, "time_per_iteration": 2.5883164405822754 }, { "auxiliary_loss_clip": 0.0120539, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.06282187, "balance_loss_mlp": 1.02922869, "epoch": 0.2978416401130283, "flos": 21285116519040.0, "grad_norm": 1.8455583514008382, "language_loss": 0.71600235, "learning_rate": 3.2936462286713546e-06, "loss": 0.73845673, "num_input_tokens_seen": 53059865, "step": 2477, "time_per_iteration": 2.5126779079437256 }, { "auxiliary_loss_clip": 0.01186866, "auxiliary_loss_mlp": 0.01038438, "balance_loss_clip": 1.05916047, "balance_loss_mlp": 1.02896106, "epoch": 0.2979618830036674, "flos": 25772154554880.0, "grad_norm": 2.168664004138645, "language_loss": 0.77701133, "learning_rate": 3.2930520555811846e-06, "loss": 0.79926431, "num_input_tokens_seen": 53079490, "step": 2478, "time_per_iteration": 2.5531508922576904 }, { "auxiliary_loss_clip": 0.01088009, "auxiliary_loss_mlp": 0.00762774, "balance_loss_clip": 1.04468942, "balance_loss_mlp": 1.00039458, "epoch": 0.2980821258943065, "flos": 23476996247040.0, "grad_norm": 1.8739427548844902, "language_loss": 0.79995179, "learning_rate": 3.292457686336046e-06, "loss": 0.81845963, "num_input_tokens_seen": 53098810, "step": 2479, "time_per_iteration": 2.7011053562164307 }, { "auxiliary_loss_clip": 0.01067681, "auxiliary_loss_mlp": 0.01004298, "balance_loss_clip": 1.01993132, "balance_loss_mlp": 1.00243878, "epoch": 0.2982023687849456, "flos": 69752314195200.0, "grad_norm": 0.9166928983129429, "language_loss": 0.61254323, "learning_rate": 3.291863121026105e-06, "loss": 0.63326299, "num_input_tokens_seen": 53162590, "step": 2480, "time_per_iteration": 3.223708391189575 }, { "auxiliary_loss_clip": 0.01184399, "auxiliary_loss_mlp": 0.01038763, "balance_loss_clip": 1.05859339, "balance_loss_mlp": 1.02901816, "epoch": 0.29832261167558466, "flos": 29825930741760.0, "grad_norm": 2.6047029110385047, "language_loss": 0.76770616, "learning_rate": 3.2912683597415547e-06, "loss": 0.78993779, "num_input_tokens_seen": 53186675, "step": 2481, "time_per_iteration": 3.4659886360168457 }, { "auxiliary_loss_clip": 0.01159052, "auxiliary_loss_mlp": 0.01034755, "balance_loss_clip": 1.05394769, "balance_loss_mlp": 1.02549219, "epoch": 0.29844285456622377, "flos": 33910158683520.0, "grad_norm": 2.5332789302447756, "language_loss": 0.78195775, "learning_rate": 3.2906734025726213e-06, "loss": 0.80389583, "num_input_tokens_seen": 53205940, "step": 2482, "time_per_iteration": 2.6735920906066895 }, { "auxiliary_loss_clip": 0.01191954, "auxiliary_loss_mlp": 0.01037516, "balance_loss_clip": 1.05945182, "balance_loss_mlp": 1.0280447, "epoch": 0.2985630974568629, "flos": 23876933253120.0, "grad_norm": 2.292629247867991, "language_loss": 0.88564754, "learning_rate": 3.290078249609559e-06, "loss": 0.9079423, "num_input_tokens_seen": 53225360, "step": 2483, "time_per_iteration": 2.526149034500122 }, { "auxiliary_loss_clip": 0.01181912, "auxiliary_loss_mlp": 0.01040473, "balance_loss_clip": 1.05900955, "balance_loss_mlp": 1.03156209, "epoch": 0.29868334034750194, "flos": 21799106184960.0, "grad_norm": 1.9866865635194229, "language_loss": 0.88038599, "learning_rate": 3.2894829009426514e-06, "loss": 0.90260988, "num_input_tokens_seen": 53243195, "step": 2484, "time_per_iteration": 2.550354242324829 }, { "auxiliary_loss_clip": 0.01181505, "auxiliary_loss_mlp": 0.01028485, "balance_loss_clip": 1.05770254, "balance_loss_mlp": 1.02009225, "epoch": 0.29880358323814105, "flos": 25666649331840.0, "grad_norm": 1.847372081028074, "language_loss": 0.77599275, "learning_rate": 3.288887356662213e-06, "loss": 0.79809266, "num_input_tokens_seen": 53264530, "step": 2485, "time_per_iteration": 2.545795440673828 }, { "auxiliary_loss_clip": 0.01069384, "auxiliary_loss_mlp": 0.01002037, "balance_loss_clip": 1.01728976, "balance_loss_mlp": 1.0003798, "epoch": 0.29892382612878016, "flos": 71005846003200.0, "grad_norm": 0.7731265833124629, "language_loss": 0.59729171, "learning_rate": 3.288291616858588e-06, "loss": 0.61800593, "num_input_tokens_seen": 53319920, "step": 2486, "time_per_iteration": 2.98641037940979 }, { "auxiliary_loss_clip": 0.01134106, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.05414724, "balance_loss_mlp": 1.02474892, "epoch": 0.2990440690194192, "flos": 25481134563840.0, "grad_norm": 1.8516763966653518, "language_loss": 0.76823449, "learning_rate": 3.287695681622149e-06, "loss": 0.78990579, "num_input_tokens_seen": 53339270, "step": 2487, "time_per_iteration": 2.6198558807373047 }, { "auxiliary_loss_clip": 0.01175706, "auxiliary_loss_mlp": 0.01035842, "balance_loss_clip": 1.05728245, "balance_loss_mlp": 1.02670467, "epoch": 0.2991643119100583, "flos": 23732357011200.0, "grad_norm": 1.9817982528549574, "language_loss": 0.807886, "learning_rate": 3.2870995510432982e-06, "loss": 0.83000147, "num_input_tokens_seen": 53357750, "step": 2488, "time_per_iteration": 4.035360336303711 }, { "auxiliary_loss_clip": 0.01174938, "auxiliary_loss_mlp": 0.01029387, "balance_loss_clip": 1.05469251, "balance_loss_mlp": 1.02095246, "epoch": 0.29928455480069743, "flos": 27417545786880.0, "grad_norm": 1.9164718326331367, "language_loss": 0.7698313, "learning_rate": 3.2865032252124697e-06, "loss": 0.79187447, "num_input_tokens_seen": 53378265, "step": 2489, "time_per_iteration": 2.575758695602417 }, { "auxiliary_loss_clip": 0.01165216, "auxiliary_loss_mlp": 0.01029461, "balance_loss_clip": 1.05098987, "balance_loss_mlp": 1.02100313, "epoch": 0.2994047976913365, "flos": 33692935184640.0, "grad_norm": 1.4667192511036746, "language_loss": 0.77692902, "learning_rate": 3.2859067042201243e-06, "loss": 0.79887581, "num_input_tokens_seen": 53400305, "step": 2490, "time_per_iteration": 3.4615354537963867 }, { "auxiliary_loss_clip": 0.01105599, "auxiliary_loss_mlp": 0.01028291, "balance_loss_clip": 1.04866779, "balance_loss_mlp": 1.01966047, "epoch": 0.2995250405819756, "flos": 16763963541120.0, "grad_norm": 2.114510171439667, "language_loss": 0.78557283, "learning_rate": 3.2853099881567544e-06, "loss": 0.80691177, "num_input_tokens_seen": 53418705, "step": 2491, "time_per_iteration": 2.616880178451538 }, { "auxiliary_loss_clip": 0.01193165, "auxiliary_loss_mlp": 0.01033258, "balance_loss_clip": 1.05818331, "balance_loss_mlp": 1.02488947, "epoch": 0.29964528347261465, "flos": 22963976248320.0, "grad_norm": 1.7478453414963975, "language_loss": 0.7895276, "learning_rate": 3.284713077112881e-06, "loss": 0.8117919, "num_input_tokens_seen": 53438135, "step": 2492, "time_per_iteration": 2.5164763927459717 }, { "auxiliary_loss_clip": 0.01162148, "auxiliary_loss_mlp": 0.01034184, "balance_loss_clip": 1.05704916, "balance_loss_mlp": 1.02520204, "epoch": 0.29976552636325376, "flos": 16938021870720.0, "grad_norm": 11.249026507481148, "language_loss": 0.86752284, "learning_rate": 3.284115971179056e-06, "loss": 0.88948613, "num_input_tokens_seen": 53452165, "step": 2493, "time_per_iteration": 2.4941508769989014 }, { "auxiliary_loss_clip": 0.01129419, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.05211735, "balance_loss_mlp": 1.02447343, "epoch": 0.2998857692538929, "flos": 17056455989760.0, "grad_norm": 1.8699477178189492, "language_loss": 0.78707057, "learning_rate": 3.283518670445859e-06, "loss": 0.80869722, "num_input_tokens_seen": 53470075, "step": 2494, "time_per_iteration": 2.6393706798553467 }, { "auxiliary_loss_clip": 0.01054034, "auxiliary_loss_mlp": 0.00752488, "balance_loss_clip": 1.01771498, "balance_loss_mlp": 1.0003413, "epoch": 0.30000601214453193, "flos": 68831528025600.0, "grad_norm": 0.6796238380858755, "language_loss": 0.5425874, "learning_rate": 3.2829211750038995e-06, "loss": 0.56065267, "num_input_tokens_seen": 53538705, "step": 2495, "time_per_iteration": 3.2677879333496094 }, { "auxiliary_loss_clip": 0.01146423, "auxiliary_loss_mlp": 0.01029227, "balance_loss_clip": 1.05059052, "balance_loss_mlp": 1.01972067, "epoch": 0.30012625503517104, "flos": 17603267708160.0, "grad_norm": 1.7644545747916844, "language_loss": 0.89033282, "learning_rate": 3.2823234849438183e-06, "loss": 0.91208935, "num_input_tokens_seen": 53556740, "step": 2496, "time_per_iteration": 2.5672407150268555 }, { "auxiliary_loss_clip": 0.01172927, "auxiliary_loss_mlp": 0.0103262, "balance_loss_clip": 1.05780172, "balance_loss_mlp": 1.02385163, "epoch": 0.30024649792581015, "flos": 21252581775360.0, "grad_norm": 1.9767833704363609, "language_loss": 0.75908834, "learning_rate": 3.2817256003562836e-06, "loss": 0.78114378, "num_input_tokens_seen": 53577115, "step": 2497, "time_per_iteration": 2.5657405853271484 }, { "auxiliary_loss_clip": 0.01127885, "auxiliary_loss_mlp": 0.01038501, "balance_loss_clip": 1.05136549, "balance_loss_mlp": 1.02963805, "epoch": 0.3003667408164492, "flos": 23003262748800.0, "grad_norm": 1.8830214502412808, "language_loss": 0.6592325, "learning_rate": 3.281127521331995e-06, "loss": 0.6808964, "num_input_tokens_seen": 53598295, "step": 2498, "time_per_iteration": 2.714897632598877 }, { "auxiliary_loss_clip": 0.010846, "auxiliary_loss_mlp": 0.01003251, "balance_loss_clip": 1.01983929, "balance_loss_mlp": 1.00155854, "epoch": 0.3004869837070883, "flos": 64232340750720.0, "grad_norm": 0.8829780417685199, "language_loss": 0.6064657, "learning_rate": 3.2805292479616798e-06, "loss": 0.62734425, "num_input_tokens_seen": 53657160, "step": 2499, "time_per_iteration": 3.001973867416382 }, { "auxiliary_loss_clip": 0.01172325, "auxiliary_loss_mlp": 0.01033925, "balance_loss_clip": 1.0560832, "balance_loss_mlp": 1.02504945, "epoch": 0.30060722659772743, "flos": 26248653400320.0, "grad_norm": 2.3060179622942303, "language_loss": 0.91577661, "learning_rate": 3.2799307803360955e-06, "loss": 0.93783909, "num_input_tokens_seen": 53673090, "step": 2500, "time_per_iteration": 2.599374294281006 }, { "auxiliary_loss_clip": 0.01195159, "auxiliary_loss_mlp": 0.01034171, "balance_loss_clip": 1.05834711, "balance_loss_mlp": 1.02541542, "epoch": 0.3007274694883665, "flos": 24970879912320.0, "grad_norm": 1.527378730121462, "language_loss": 0.81548417, "learning_rate": 3.27933211854603e-06, "loss": 0.83777744, "num_input_tokens_seen": 53692145, "step": 2501, "time_per_iteration": 2.512568950653076 }, { "auxiliary_loss_clip": 0.01167227, "auxiliary_loss_mlp": 0.01033772, "balance_loss_clip": 1.05417728, "balance_loss_mlp": 1.02463424, "epoch": 0.3008477123790056, "flos": 17055845458560.0, "grad_norm": 1.7185268514402356, "language_loss": 0.87280363, "learning_rate": 3.278733262682299e-06, "loss": 0.8948136, "num_input_tokens_seen": 53710000, "step": 2502, "time_per_iteration": 2.5493764877319336 }, { "auxiliary_loss_clip": 0.01199833, "auxiliary_loss_mlp": 0.01031592, "balance_loss_clip": 1.05951917, "balance_loss_mlp": 1.02304494, "epoch": 0.3009679552696447, "flos": 21506398254720.0, "grad_norm": 3.132165855924124, "language_loss": 0.82169616, "learning_rate": 3.2781342128357484e-06, "loss": 0.84401047, "num_input_tokens_seen": 53729355, "step": 2503, "time_per_iteration": 2.4934184551239014 }, { "auxiliary_loss_clip": 0.01153029, "auxiliary_loss_mlp": 0.01033517, "balance_loss_clip": 1.05246639, "balance_loss_mlp": 1.025226, "epoch": 0.30108819816028376, "flos": 21134004001920.0, "grad_norm": 6.082762872591491, "language_loss": 0.8049134, "learning_rate": 3.2775349690972547e-06, "loss": 0.82677889, "num_input_tokens_seen": 53743505, "step": 2504, "time_per_iteration": 2.5580382347106934 }, { "auxiliary_loss_clip": 0.01076298, "auxiliary_loss_mlp": 0.01002128, "balance_loss_clip": 1.02604365, "balance_loss_mlp": 1.0003165, "epoch": 0.30120844105092287, "flos": 71126434938240.0, "grad_norm": 0.7682175658819874, "language_loss": 0.51827741, "learning_rate": 3.276935531557722e-06, "loss": 0.53906167, "num_input_tokens_seen": 53808725, "step": 2505, "time_per_iteration": 3.192943811416626 }, { "auxiliary_loss_clip": 0.01143697, "auxiliary_loss_mlp": 0.01028278, "balance_loss_clip": 1.05334234, "balance_loss_mlp": 1.01952839, "epoch": 0.301328683941562, "flos": 20264571302400.0, "grad_norm": 2.4117771820199168, "language_loss": 0.79473758, "learning_rate": 3.2763359003080837e-06, "loss": 0.81645733, "num_input_tokens_seen": 53825680, "step": 2506, "time_per_iteration": 2.5741944313049316 }, { "auxiliary_loss_clip": 0.01060626, "auxiliary_loss_mlp": 0.01002012, "balance_loss_clip": 1.01608419, "balance_loss_mlp": 1.00024819, "epoch": 0.30144892683220104, "flos": 70648212240000.0, "grad_norm": 0.8035525642113569, "language_loss": 0.6243301, "learning_rate": 3.2757360754393047e-06, "loss": 0.64495653, "num_input_tokens_seen": 53889750, "step": 2507, "time_per_iteration": 3.229503870010376 }, { "auxiliary_loss_clip": 0.01181797, "auxiliary_loss_mlp": 0.01031206, "balance_loss_clip": 1.05653954, "balance_loss_mlp": 1.02274227, "epoch": 0.30156916972284015, "flos": 22820549241600.0, "grad_norm": 2.63479311260939, "language_loss": 0.63851678, "learning_rate": 3.2751360570423767e-06, "loss": 0.6606468, "num_input_tokens_seen": 53908135, "step": 2508, "time_per_iteration": 3.2388031482696533 }, { "auxiliary_loss_clip": 0.0116737, "auxiliary_loss_mlp": 0.01039814, "balance_loss_clip": 1.05689895, "balance_loss_mlp": 1.03058088, "epoch": 0.3016894126134792, "flos": 29899188529920.0, "grad_norm": 1.9913099687783427, "language_loss": 0.75955772, "learning_rate": 3.2745358452083236e-06, "loss": 0.78162956, "num_input_tokens_seen": 53931035, "step": 2509, "time_per_iteration": 2.5982401371002197 }, { "auxiliary_loss_clip": 0.01182833, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.05768633, "balance_loss_mlp": 1.02494764, "epoch": 0.3018096555041183, "flos": 21546331200000.0, "grad_norm": 1.3697335034048275, "language_loss": 0.82241249, "learning_rate": 3.2739354400281955e-06, "loss": 0.84456819, "num_input_tokens_seen": 53952255, "step": 2510, "time_per_iteration": 2.539440631866455 }, { "auxiliary_loss_clip": 0.01052113, "auxiliary_loss_mlp": 0.00752646, "balance_loss_clip": 1.01704717, "balance_loss_mlp": 1.00036645, "epoch": 0.3019298983947574, "flos": 59136294597120.0, "grad_norm": 0.8661578115998271, "language_loss": 0.63716054, "learning_rate": 3.2733348415930744e-06, "loss": 0.65520811, "num_input_tokens_seen": 54014125, "step": 2511, "time_per_iteration": 3.191734790802002 }, { "auxiliary_loss_clip": 0.01148245, "auxiliary_loss_mlp": 0.01032909, "balance_loss_clip": 1.05212831, "balance_loss_mlp": 1.02471375, "epoch": 0.3020501412853965, "flos": 34423070941440.0, "grad_norm": 1.801940441230278, "language_loss": 0.79952371, "learning_rate": 3.27273404999407e-06, "loss": 0.82133526, "num_input_tokens_seen": 54036345, "step": 2512, "time_per_iteration": 2.6981899738311768 }, { "auxiliary_loss_clip": 0.01062004, "auxiliary_loss_mlp": 0.01011936, "balance_loss_clip": 1.01759267, "balance_loss_mlp": 1.01024365, "epoch": 0.3021703841760356, "flos": 71008288128000.0, "grad_norm": 0.7937731899838413, "language_loss": 0.60533571, "learning_rate": 3.272133065322322e-06, "loss": 0.62607515, "num_input_tokens_seen": 54094615, "step": 2513, "time_per_iteration": 3.8573601245880127 }, { "auxiliary_loss_clip": 0.01192817, "auxiliary_loss_mlp": 0.01027446, "balance_loss_clip": 1.05632186, "balance_loss_mlp": 1.01931643, "epoch": 0.3022906270666747, "flos": 21510528318720.0, "grad_norm": 1.8612740247388586, "language_loss": 0.79622924, "learning_rate": 3.271531887669e-06, "loss": 0.81843185, "num_input_tokens_seen": 54114675, "step": 2514, "time_per_iteration": 3.246549606323242 }, { "auxiliary_loss_clip": 0.01142308, "auxiliary_loss_mlp": 0.01033896, "balance_loss_clip": 1.04935098, "balance_loss_mlp": 1.02475858, "epoch": 0.30241086995731375, "flos": 31132001168640.0, "grad_norm": 2.185202647955437, "language_loss": 0.63061851, "learning_rate": 3.2709305171253015e-06, "loss": 0.65238059, "num_input_tokens_seen": 54134795, "step": 2515, "time_per_iteration": 3.432671070098877 }, { "auxiliary_loss_clip": 0.01182252, "auxiliary_loss_mlp": 0.01040488, "balance_loss_clip": 1.05780625, "balance_loss_mlp": 1.03173208, "epoch": 0.30253111284795287, "flos": 23511542152320.0, "grad_norm": 1.9078184259991826, "language_loss": 0.77801979, "learning_rate": 3.2703289537824536e-06, "loss": 0.80024719, "num_input_tokens_seen": 54154595, "step": 2516, "time_per_iteration": 2.528658151626587 }, { "auxiliary_loss_clip": 0.01143445, "auxiliary_loss_mlp": 0.01035326, "balance_loss_clip": 1.05332184, "balance_loss_mlp": 1.02689147, "epoch": 0.302651355738592, "flos": 18725367651840.0, "grad_norm": 2.4548310164834413, "language_loss": 0.78751862, "learning_rate": 3.269727197731714e-06, "loss": 0.80930638, "num_input_tokens_seen": 54167360, "step": 2517, "time_per_iteration": 2.5638952255249023 }, { "auxiliary_loss_clip": 0.01136357, "auxiliary_loss_mlp": 0.01033775, "balance_loss_clip": 1.05473781, "balance_loss_mlp": 1.02509093, "epoch": 0.30277159862923103, "flos": 22418888382720.0, "grad_norm": 1.5790593283914038, "language_loss": 0.78082377, "learning_rate": 3.269125249064367e-06, "loss": 0.80252516, "num_input_tokens_seen": 54187055, "step": 2518, "time_per_iteration": 2.5984466075897217 }, { "auxiliary_loss_clip": 0.01199686, "auxiliary_loss_mlp": 0.01036093, "balance_loss_clip": 1.05958676, "balance_loss_mlp": 1.02689052, "epoch": 0.30289184151987014, "flos": 22273126992000.0, "grad_norm": 1.5576144307137136, "language_loss": 0.83020765, "learning_rate": 3.2685231078717297e-06, "loss": 0.85256541, "num_input_tokens_seen": 54207245, "step": 2519, "time_per_iteration": 2.4865667819976807 }, { "auxiliary_loss_clip": 0.01148391, "auxiliary_loss_mlp": 0.00762641, "balance_loss_clip": 1.05544639, "balance_loss_mlp": 1.00035441, "epoch": 0.30301208441050925, "flos": 25225594231680.0, "grad_norm": 1.9020986290103246, "language_loss": 0.7547785, "learning_rate": 3.267920774245145e-06, "loss": 0.77388883, "num_input_tokens_seen": 54226650, "step": 2520, "time_per_iteration": 2.6166739463806152 }, { "auxiliary_loss_clip": 0.01188565, "auxiliary_loss_mlp": 0.01036526, "balance_loss_clip": 1.06115317, "balance_loss_mlp": 1.02735305, "epoch": 0.3031323273011483, "flos": 23039245198080.0, "grad_norm": 1.8708976004905744, "language_loss": 0.84409654, "learning_rate": 3.2673182482759876e-06, "loss": 0.86634743, "num_input_tokens_seen": 54245765, "step": 2521, "time_per_iteration": 2.525531530380249 }, { "auxiliary_loss_clip": 0.01181621, "auxiliary_loss_mlp": 0.01029863, "balance_loss_clip": 1.05840027, "balance_loss_mlp": 1.02099955, "epoch": 0.3032525701917874, "flos": 18876695650560.0, "grad_norm": 2.0359406100803747, "language_loss": 0.66128957, "learning_rate": 3.266715530055659e-06, "loss": 0.68340433, "num_input_tokens_seen": 54263915, "step": 2522, "time_per_iteration": 2.496884822845459 }, { "auxiliary_loss_clip": 0.01171481, "auxiliary_loss_mlp": 0.01034556, "balance_loss_clip": 1.05373025, "balance_loss_mlp": 1.02518034, "epoch": 0.30337281308242653, "flos": 17782641250560.0, "grad_norm": 1.9916194513682184, "language_loss": 0.80136335, "learning_rate": 3.2661126196755927e-06, "loss": 0.82342374, "num_input_tokens_seen": 54283025, "step": 2523, "time_per_iteration": 2.5111122131347656 }, { "auxiliary_loss_clip": 0.01081551, "auxiliary_loss_mlp": 0.01002273, "balance_loss_clip": 1.0172832, "balance_loss_mlp": 1.00052094, "epoch": 0.3034930559730656, "flos": 57824298426240.0, "grad_norm": 0.7819536297835757, "language_loss": 0.55961281, "learning_rate": 3.265509517227248e-06, "loss": 0.58045107, "num_input_tokens_seen": 54339840, "step": 2524, "time_per_iteration": 3.0484039783477783 }, { "auxiliary_loss_clip": 0.01166663, "auxiliary_loss_mlp": 0.01032368, "balance_loss_clip": 1.05383778, "balance_loss_mlp": 1.02385604, "epoch": 0.3036132988637047, "flos": 14755587419520.0, "grad_norm": 1.6523018370074398, "language_loss": 0.80836821, "learning_rate": 3.264906222802115e-06, "loss": 0.83035851, "num_input_tokens_seen": 54357690, "step": 2525, "time_per_iteration": 2.509690999984741 }, { "auxiliary_loss_clip": 0.01198212, "auxiliary_loss_mlp": 0.01029357, "balance_loss_clip": 1.05898213, "balance_loss_mlp": 1.0206306, "epoch": 0.30373354175434375, "flos": 21033203460480.0, "grad_norm": 2.0969381309444013, "language_loss": 0.78439939, "learning_rate": 3.264302736491715e-06, "loss": 0.80667502, "num_input_tokens_seen": 54377810, "step": 2526, "time_per_iteration": 2.4733009338378906 }, { "auxiliary_loss_clip": 0.01181131, "auxiliary_loss_mlp": 0.01032471, "balance_loss_clip": 1.0604316, "balance_loss_mlp": 1.02381682, "epoch": 0.30385378464498286, "flos": 21143233797120.0, "grad_norm": 2.168278587142872, "language_loss": 0.87450337, "learning_rate": 3.263699058387594e-06, "loss": 0.89663947, "num_input_tokens_seen": 54395245, "step": 2527, "time_per_iteration": 2.5022695064544678 }, { "auxiliary_loss_clip": 0.01149483, "auxiliary_loss_mlp": 0.01044465, "balance_loss_clip": 1.05073667, "balance_loss_mlp": 1.03564978, "epoch": 0.30397402753562197, "flos": 20629244131200.0, "grad_norm": 2.1181542033417915, "language_loss": 0.90261078, "learning_rate": 3.2630951885813315e-06, "loss": 0.92455029, "num_input_tokens_seen": 54412640, "step": 2528, "time_per_iteration": 2.5604114532470703 }, { "auxiliary_loss_clip": 0.01166555, "auxiliary_loss_mlp": 0.01028375, "balance_loss_clip": 1.05124426, "balance_loss_mlp": 1.01988101, "epoch": 0.304094270426261, "flos": 15085678429440.0, "grad_norm": 2.1007850549966314, "language_loss": 0.78152549, "learning_rate": 3.262491127164533e-06, "loss": 0.80347478, "num_input_tokens_seen": 54431455, "step": 2529, "time_per_iteration": 2.523890972137451 }, { "auxiliary_loss_clip": 0.01173163, "auxiliary_loss_mlp": 0.0076309, "balance_loss_clip": 1.05515623, "balance_loss_mlp": 1.00034738, "epoch": 0.30421451331690014, "flos": 13845216193920.0, "grad_norm": 2.255856340570019, "language_loss": 0.79805803, "learning_rate": 3.2618868742288337e-06, "loss": 0.8174206, "num_input_tokens_seen": 54448380, "step": 2530, "time_per_iteration": 2.5040972232818604 }, { "auxiliary_loss_clip": 0.01182388, "auxiliary_loss_mlp": 0.01031417, "balance_loss_clip": 1.05719423, "balance_loss_mlp": 1.02291715, "epoch": 0.30433475620753925, "flos": 17384212615680.0, "grad_norm": 2.017622800504052, "language_loss": 0.72303164, "learning_rate": 3.261282429865899e-06, "loss": 0.74516976, "num_input_tokens_seen": 54466385, "step": 2531, "time_per_iteration": 2.483461380004883 }, { "auxiliary_loss_clip": 0.01175824, "auxiliary_loss_mlp": 0.00761999, "balance_loss_clip": 1.05923438, "balance_loss_mlp": 1.00039148, "epoch": 0.3044549990981783, "flos": 18916951818240.0, "grad_norm": 1.5953528267098787, "language_loss": 0.72071534, "learning_rate": 3.2606777941674225e-06, "loss": 0.74009359, "num_input_tokens_seen": 54485040, "step": 2532, "time_per_iteration": 2.5249242782592773 }, { "auxiliary_loss_clip": 0.01130786, "auxiliary_loss_mlp": 0.01034272, "balance_loss_clip": 1.05267215, "balance_loss_mlp": 1.02540898, "epoch": 0.3045752419888174, "flos": 21068431724160.0, "grad_norm": 1.909625361472053, "language_loss": 0.84179604, "learning_rate": 3.2600729672251276e-06, "loss": 0.86344659, "num_input_tokens_seen": 54502755, "step": 2533, "time_per_iteration": 3.365257740020752 }, { "auxiliary_loss_clip": 0.01198087, "auxiliary_loss_mlp": 0.00762996, "balance_loss_clip": 1.06117153, "balance_loss_mlp": 1.00040221, "epoch": 0.3046954848794565, "flos": 29096405516160.0, "grad_norm": 1.850463472491268, "language_loss": 0.65773833, "learning_rate": 3.259467949130765e-06, "loss": 0.67734909, "num_input_tokens_seen": 54524165, "step": 2534, "time_per_iteration": 2.5522944927215576 }, { "auxiliary_loss_clip": 0.01175266, "auxiliary_loss_mlp": 0.01034426, "balance_loss_clip": 1.06070638, "balance_loss_mlp": 1.02537215, "epoch": 0.3048157277700956, "flos": 20295346279680.0, "grad_norm": 2.0144911337319096, "language_loss": 0.82292122, "learning_rate": 3.2588627399761164e-06, "loss": 0.84501815, "num_input_tokens_seen": 54540160, "step": 2535, "time_per_iteration": 2.541459798812866 }, { "auxiliary_loss_clip": 0.01169891, "auxiliary_loss_mlp": 0.01033808, "balance_loss_clip": 1.05719399, "balance_loss_mlp": 1.02512908, "epoch": 0.3049359706607347, "flos": 22739929165440.0, "grad_norm": 1.599523483125588, "language_loss": 0.70778739, "learning_rate": 3.2582573398529903e-06, "loss": 0.72982442, "num_input_tokens_seen": 54557515, "step": 2536, "time_per_iteration": 2.552891492843628 }, { "auxiliary_loss_clip": 0.01152113, "auxiliary_loss_mlp": 0.01034635, "balance_loss_clip": 1.05140316, "balance_loss_mlp": 1.02549207, "epoch": 0.3050562135513738, "flos": 18434634969600.0, "grad_norm": 2.4436812299564403, "language_loss": 0.74142241, "learning_rate": 3.2576517488532265e-06, "loss": 0.76328993, "num_input_tokens_seen": 54573865, "step": 2537, "time_per_iteration": 2.526862382888794 }, { "auxiliary_loss_clip": 0.01180356, "auxiliary_loss_mlp": 0.01030149, "balance_loss_clip": 1.05485594, "balance_loss_mlp": 1.02172065, "epoch": 0.30517645644201286, "flos": 20370327920640.0, "grad_norm": 1.7330459882854072, "language_loss": 0.87365425, "learning_rate": 3.257045967068692e-06, "loss": 0.89575928, "num_input_tokens_seen": 54593120, "step": 2538, "time_per_iteration": 2.5266571044921875 }, { "auxiliary_loss_clip": 0.01198296, "auxiliary_loss_mlp": 0.0103558, "balance_loss_clip": 1.05963683, "balance_loss_mlp": 1.02646089, "epoch": 0.30529669933265197, "flos": 21945118970880.0, "grad_norm": 1.920468654371795, "language_loss": 0.82761389, "learning_rate": 3.2564399945912848e-06, "loss": 0.8499527, "num_input_tokens_seen": 54612910, "step": 2539, "time_per_iteration": 3.9914872646331787 }, { "auxiliary_loss_clip": 0.01140932, "auxiliary_loss_mlp": 0.01030244, "balance_loss_clip": 1.0504396, "balance_loss_mlp": 1.02242374, "epoch": 0.305416942223291, "flos": 21835411856640.0, "grad_norm": 2.228797552218803, "language_loss": 0.8238256, "learning_rate": 3.2558338315129287e-06, "loss": 0.8455373, "num_input_tokens_seen": 54631055, "step": 2540, "time_per_iteration": 2.631718397140503 }, { "auxiliary_loss_clip": 0.0117839, "auxiliary_loss_mlp": 0.01042521, "balance_loss_clip": 1.05852389, "balance_loss_mlp": 1.03335428, "epoch": 0.30553718511393013, "flos": 33911810709120.0, "grad_norm": 2.6107476069767293, "language_loss": 0.75773156, "learning_rate": 3.2552274779255785e-06, "loss": 0.77994066, "num_input_tokens_seen": 54651985, "step": 2541, "time_per_iteration": 3.440293073654175 }, { "auxiliary_loss_clip": 0.01183047, "auxiliary_loss_mlp": 0.01036176, "balance_loss_clip": 1.05930781, "balance_loss_mlp": 1.02758121, "epoch": 0.30565742800456924, "flos": 22268530051200.0, "grad_norm": 2.038288059235117, "language_loss": 0.76856625, "learning_rate": 3.2546209339212184e-06, "loss": 0.79075849, "num_input_tokens_seen": 54671005, "step": 2542, "time_per_iteration": 2.507427930831909 }, { "auxiliary_loss_clip": 0.0116874, "auxiliary_loss_mlp": 0.01033279, "balance_loss_clip": 1.05371165, "balance_loss_mlp": 1.02434397, "epoch": 0.3057776708952083, "flos": 22565044823040.0, "grad_norm": 1.724628098492537, "language_loss": 0.77561647, "learning_rate": 3.25401419959186e-06, "loss": 0.79763663, "num_input_tokens_seen": 54691615, "step": 2543, "time_per_iteration": 2.576265573501587 }, { "auxiliary_loss_clip": 0.0118407, "auxiliary_loss_mlp": 0.01038356, "balance_loss_clip": 1.06359279, "balance_loss_mlp": 1.02961802, "epoch": 0.3058979137858474, "flos": 21799213925760.0, "grad_norm": 1.7725735004470247, "language_loss": 0.76260662, "learning_rate": 3.253407275029545e-06, "loss": 0.78483087, "num_input_tokens_seen": 54710520, "step": 2544, "time_per_iteration": 2.529459238052368 }, { "auxiliary_loss_clip": 0.01160519, "auxiliary_loss_mlp": 0.01051837, "balance_loss_clip": 1.05822229, "balance_loss_mlp": 1.04193652, "epoch": 0.3060181566764865, "flos": 26979435601920.0, "grad_norm": 1.8135670480119648, "language_loss": 0.80179894, "learning_rate": 3.2528001603263425e-06, "loss": 0.82392251, "num_input_tokens_seen": 54732590, "step": 2545, "time_per_iteration": 2.617621421813965 }, { "auxiliary_loss_clip": 0.01181743, "auxiliary_loss_mlp": 0.01033662, "balance_loss_clip": 1.05999923, "balance_loss_mlp": 1.02489471, "epoch": 0.3061383995671256, "flos": 19865101173120.0, "grad_norm": 2.613676724602542, "language_loss": 0.81630123, "learning_rate": 3.2521928555743514e-06, "loss": 0.83845532, "num_input_tokens_seen": 54749935, "step": 2546, "time_per_iteration": 2.507024049758911 }, { "auxiliary_loss_clip": 0.01163451, "auxiliary_loss_mlp": 0.00763142, "balance_loss_clip": 1.0545032, "balance_loss_mlp": 1.00041592, "epoch": 0.3062586424577647, "flos": 22127509255680.0, "grad_norm": 2.21806823898555, "language_loss": 0.67127484, "learning_rate": 3.2515853608657e-06, "loss": 0.69054079, "num_input_tokens_seen": 54767935, "step": 2547, "time_per_iteration": 2.56483793258667 }, { "auxiliary_loss_clip": 0.01178929, "auxiliary_loss_mlp": 0.01035601, "balance_loss_clip": 1.05679226, "balance_loss_mlp": 1.02654147, "epoch": 0.3063788853484038, "flos": 20845497962880.0, "grad_norm": 2.0680961327915406, "language_loss": 0.75716972, "learning_rate": 3.250977676292545e-06, "loss": 0.77931499, "num_input_tokens_seen": 54786175, "step": 2548, "time_per_iteration": 2.5004711151123047 }, { "auxiliary_loss_clip": 0.0117219, "auxiliary_loss_mlp": 0.01032254, "balance_loss_clip": 1.05743444, "balance_loss_mlp": 1.02376068, "epoch": 0.30649912823904285, "flos": 16209717707520.0, "grad_norm": 2.1158165630208905, "language_loss": 0.79423428, "learning_rate": 3.2503698019470712e-06, "loss": 0.8162787, "num_input_tokens_seen": 54801945, "step": 2549, "time_per_iteration": 2.5124754905700684 }, { "auxiliary_loss_clip": 0.01180804, "auxiliary_loss_mlp": 0.01035935, "balance_loss_clip": 1.05454206, "balance_loss_mlp": 1.02642262, "epoch": 0.30661937112968196, "flos": 18617815353600.0, "grad_norm": 3.0040414726095026, "language_loss": 0.78617847, "learning_rate": 3.249761737921492e-06, "loss": 0.80834585, "num_input_tokens_seen": 54818475, "step": 2550, "time_per_iteration": 2.4747815132141113 }, { "auxiliary_loss_clip": 0.0116837, "auxiliary_loss_mlp": 0.01033765, "balance_loss_clip": 1.05945778, "balance_loss_mlp": 1.02576065, "epoch": 0.30673961402032107, "flos": 31390809638400.0, "grad_norm": 1.8893010958670406, "language_loss": 0.74109215, "learning_rate": 3.249153484308051e-06, "loss": 0.7631135, "num_input_tokens_seen": 54837090, "step": 2551, "time_per_iteration": 2.611985921859741 }, { "auxiliary_loss_clip": 0.0112893, "auxiliary_loss_mlp": 0.0103227, "balance_loss_clip": 1.05196702, "balance_loss_mlp": 1.02314484, "epoch": 0.3068598569109601, "flos": 20229809915520.0, "grad_norm": 2.0486994898444397, "language_loss": 0.77772915, "learning_rate": 3.2485450411990194e-06, "loss": 0.7993412, "num_input_tokens_seen": 54856445, "step": 2552, "time_per_iteration": 2.5905821323394775 }, { "auxiliary_loss_clip": 0.01198225, "auxiliary_loss_mlp": 0.01037468, "balance_loss_clip": 1.05861413, "balance_loss_mlp": 1.02852178, "epoch": 0.30698009980159924, "flos": 29601991399680.0, "grad_norm": 2.9932346671863628, "language_loss": 0.82199484, "learning_rate": 3.2479364086866983e-06, "loss": 0.84435177, "num_input_tokens_seen": 54876700, "step": 2553, "time_per_iteration": 2.5804643630981445 }, { "auxiliary_loss_clip": 0.01170804, "auxiliary_loss_mlp": 0.00762814, "balance_loss_clip": 1.05864286, "balance_loss_mlp": 1.00042319, "epoch": 0.30710034269223835, "flos": 23842423261440.0, "grad_norm": 1.8612303810835158, "language_loss": 0.81566066, "learning_rate": 3.247327586863416e-06, "loss": 0.83499682, "num_input_tokens_seen": 54897580, "step": 2554, "time_per_iteration": 2.575174570083618 }, { "auxiliary_loss_clip": 0.01161849, "auxiliary_loss_mlp": 0.01032091, "balance_loss_clip": 1.05632138, "balance_loss_mlp": 1.02351451, "epoch": 0.3072205855828774, "flos": 25884986152320.0, "grad_norm": 2.113861015884066, "language_loss": 0.76875246, "learning_rate": 3.2467185758215304e-06, "loss": 0.79069191, "num_input_tokens_seen": 54917320, "step": 2555, "time_per_iteration": 2.6148173809051514 }, { "auxiliary_loss_clip": 0.01158945, "auxiliary_loss_mlp": 0.00762409, "balance_loss_clip": 1.05702245, "balance_loss_mlp": 1.00041604, "epoch": 0.3073408284735165, "flos": 22236390357120.0, "grad_norm": 2.6073137912295334, "language_loss": 0.85361958, "learning_rate": 3.246109375653428e-06, "loss": 0.87283313, "num_input_tokens_seen": 54934085, "step": 2556, "time_per_iteration": 2.5625100135803223 }, { "auxiliary_loss_clip": 0.01198068, "auxiliary_loss_mlp": 0.01034281, "balance_loss_clip": 1.05988526, "balance_loss_mlp": 1.02507818, "epoch": 0.30746107136415557, "flos": 19500284689920.0, "grad_norm": 1.8380235341051823, "language_loss": 0.78277516, "learning_rate": 3.2454999864515243e-06, "loss": 0.80509865, "num_input_tokens_seen": 54953460, "step": 2557, "time_per_iteration": 2.463102340698242 }, { "auxiliary_loss_clip": 0.01160482, "auxiliary_loss_mlp": 0.00762447, "balance_loss_clip": 1.0546912, "balance_loss_mlp": 1.00040495, "epoch": 0.3075813142547947, "flos": 21724806902400.0, "grad_norm": 1.6634258851764445, "language_loss": 0.69088495, "learning_rate": 3.244890408308263e-06, "loss": 0.71011424, "num_input_tokens_seen": 54974165, "step": 2558, "time_per_iteration": 2.535918951034546 }, { "auxiliary_loss_clip": 0.01142064, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.05249047, "balance_loss_mlp": 1.02312827, "epoch": 0.3077015571454338, "flos": 24097963593600.0, "grad_norm": 2.267915880092834, "language_loss": 0.60576057, "learning_rate": 3.2442806413161165e-06, "loss": 0.62749898, "num_input_tokens_seen": 54993810, "step": 2559, "time_per_iteration": 3.3876137733459473 }, { "auxiliary_loss_clip": 0.01144073, "auxiliary_loss_mlp": 0.01035852, "balance_loss_clip": 1.05376577, "balance_loss_mlp": 1.0264883, "epoch": 0.30782180003607285, "flos": 18405476104320.0, "grad_norm": 2.012512699051081, "language_loss": 0.75576341, "learning_rate": 3.243670685567586e-06, "loss": 0.77756262, "num_input_tokens_seen": 55011210, "step": 2560, "time_per_iteration": 2.555206537246704 }, { "auxiliary_loss_clip": 0.01167069, "auxiliary_loss_mlp": 0.00761814, "balance_loss_clip": 1.05464947, "balance_loss_mlp": 1.00040722, "epoch": 0.30794204292671196, "flos": 23878549365120.0, "grad_norm": 2.158706153934657, "language_loss": 0.80485928, "learning_rate": 3.2430605411552012e-06, "loss": 0.82414818, "num_input_tokens_seen": 55031325, "step": 2561, "time_per_iteration": 2.547567844390869 }, { "auxiliary_loss_clip": 0.01057706, "auxiliary_loss_mlp": 0.01006201, "balance_loss_clip": 1.02114463, "balance_loss_mlp": 1.00437737, "epoch": 0.30806228581735107, "flos": 67927800816000.0, "grad_norm": 0.9006923252131376, "language_loss": 0.70524442, "learning_rate": 3.2424502081715205e-06, "loss": 0.72588348, "num_input_tokens_seen": 55094440, "step": 2562, "time_per_iteration": 3.18129301071167 }, { "auxiliary_loss_clip": 0.01169608, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.05658698, "balance_loss_mlp": 1.02056825, "epoch": 0.3081825287079901, "flos": 23843213360640.0, "grad_norm": 1.747616828451794, "language_loss": 0.77967119, "learning_rate": 3.241839686709132e-06, "loss": 0.80166054, "num_input_tokens_seen": 55115375, "step": 2563, "time_per_iteration": 2.5585925579071045 }, { "auxiliary_loss_clip": 0.01182333, "auxiliary_loss_mlp": 0.0103549, "balance_loss_clip": 1.05673349, "balance_loss_mlp": 1.02569675, "epoch": 0.30830277159862923, "flos": 16209969102720.0, "grad_norm": 2.4862505534369173, "language_loss": 0.81844336, "learning_rate": 3.2412289768606495e-06, "loss": 0.84062159, "num_input_tokens_seen": 55131945, "step": 2564, "time_per_iteration": 2.4799702167510986 }, { "auxiliary_loss_clip": 0.01187946, "auxiliary_loss_mlp": 0.01027215, "balance_loss_clip": 1.06035614, "balance_loss_mlp": 1.0187335, "epoch": 0.30842301448926834, "flos": 29349503723520.0, "grad_norm": 1.9886474931816278, "language_loss": 0.82627678, "learning_rate": 3.240618078718718e-06, "loss": 0.84842837, "num_input_tokens_seen": 55153405, "step": 2565, "time_per_iteration": 3.386650323867798 }, { "auxiliary_loss_clip": 0.01148991, "auxiliary_loss_mlp": 0.01029054, "balance_loss_clip": 1.05012143, "balance_loss_mlp": 1.01930296, "epoch": 0.3085432573799074, "flos": 21945190798080.0, "grad_norm": 2.0091404206652763, "language_loss": 0.7415874, "learning_rate": 3.240006992376011e-06, "loss": 0.76336789, "num_input_tokens_seen": 55173030, "step": 2566, "time_per_iteration": 2.591628074645996 }, { "auxiliary_loss_clip": 0.01173963, "auxiliary_loss_mlp": 0.01036792, "balance_loss_clip": 1.05764103, "balance_loss_mlp": 1.02825665, "epoch": 0.3086635002705465, "flos": 22054718344320.0, "grad_norm": 2.255674596849611, "language_loss": 0.76140141, "learning_rate": 3.2393957179252284e-06, "loss": 0.78350902, "num_input_tokens_seen": 55189565, "step": 2567, "time_per_iteration": 2.540149211883545 }, { "auxiliary_loss_clip": 0.01202387, "auxiliary_loss_mlp": 0.0102873, "balance_loss_clip": 1.06331396, "balance_loss_mlp": 1.01974154, "epoch": 0.3087837431611856, "flos": 32665925520000.0, "grad_norm": 1.8690733128055979, "language_loss": 0.80316991, "learning_rate": 3.2387842554591016e-06, "loss": 0.82548106, "num_input_tokens_seen": 55210380, "step": 2568, "time_per_iteration": 3.363516330718994 }, { "auxiliary_loss_clip": 0.01200829, "auxiliary_loss_mlp": 0.01038736, "balance_loss_clip": 1.06093061, "balance_loss_mlp": 1.02927351, "epoch": 0.3089039860518247, "flos": 17599245384960.0, "grad_norm": 2.213299967552078, "language_loss": 0.87851357, "learning_rate": 3.238172605070388e-06, "loss": 0.90090919, "num_input_tokens_seen": 55225795, "step": 2569, "time_per_iteration": 2.441365957260132 }, { "auxiliary_loss_clip": 0.01183529, "auxiliary_loss_mlp": 0.00763253, "balance_loss_clip": 1.05926013, "balance_loss_mlp": 1.0003866, "epoch": 0.3090242289424638, "flos": 14383839611520.0, "grad_norm": 2.4240794206290444, "language_loss": 0.78073436, "learning_rate": 3.2375607668518745e-06, "loss": 0.80020213, "num_input_tokens_seen": 55238830, "step": 2570, "time_per_iteration": 2.4520339965820312 }, { "auxiliary_loss_clip": 0.01162413, "auxiliary_loss_mlp": 0.01036923, "balance_loss_clip": 1.056445, "balance_loss_mlp": 1.02807188, "epoch": 0.30914447183310284, "flos": 16068625084800.0, "grad_norm": 2.070592753815051, "language_loss": 0.89806199, "learning_rate": 3.236948740896377e-06, "loss": 0.92005527, "num_input_tokens_seen": 55253630, "step": 2571, "time_per_iteration": 2.5365450382232666 }, { "auxiliary_loss_clip": 0.011845, "auxiliary_loss_mlp": 0.01035248, "balance_loss_clip": 1.05976129, "balance_loss_mlp": 1.02544296, "epoch": 0.30926471472374195, "flos": 32230221546240.0, "grad_norm": 1.6345415454869123, "language_loss": 0.84388793, "learning_rate": 3.2363365272967384e-06, "loss": 0.86608541, "num_input_tokens_seen": 55276200, "step": 2572, "time_per_iteration": 2.610645055770874 }, { "auxiliary_loss_clip": 0.01184935, "auxiliary_loss_mlp": 0.01036437, "balance_loss_clip": 1.06204081, "balance_loss_mlp": 1.02686501, "epoch": 0.30938495761438106, "flos": 20370722970240.0, "grad_norm": 1.960744900784151, "language_loss": 0.81505418, "learning_rate": 3.235724126145832e-06, "loss": 0.83726794, "num_input_tokens_seen": 55292235, "step": 2573, "time_per_iteration": 2.4930341243743896 }, { "auxiliary_loss_clip": 0.01171881, "auxiliary_loss_mlp": 0.01032802, "balance_loss_clip": 1.05516481, "balance_loss_mlp": 1.0237484, "epoch": 0.3095052005050201, "flos": 24061155131520.0, "grad_norm": 1.5050249768349608, "language_loss": 0.77745706, "learning_rate": 3.235111537536558e-06, "loss": 0.79950392, "num_input_tokens_seen": 55313050, "step": 2574, "time_per_iteration": 2.524223566055298 }, { "auxiliary_loss_clip": 0.01184952, "auxiliary_loss_mlp": 0.0102691, "balance_loss_clip": 1.05885553, "balance_loss_mlp": 1.01810682, "epoch": 0.30962544339565923, "flos": 23401547729280.0, "grad_norm": 1.9214351489062536, "language_loss": 0.82769883, "learning_rate": 3.2344987615618456e-06, "loss": 0.84981745, "num_input_tokens_seen": 55332885, "step": 2575, "time_per_iteration": 2.5329270362854004 }, { "auxiliary_loss_clip": 0.01155351, "auxiliary_loss_mlp": 0.01033813, "balance_loss_clip": 1.05764699, "balance_loss_mlp": 1.02538466, "epoch": 0.30974568628629834, "flos": 33799984692480.0, "grad_norm": 1.9556408980930045, "language_loss": 0.78194308, "learning_rate": 3.2338857983146533e-06, "loss": 0.8038348, "num_input_tokens_seen": 55354385, "step": 2576, "time_per_iteration": 2.677990674972534 }, { "auxiliary_loss_clip": 0.01163062, "auxiliary_loss_mlp": 0.01036515, "balance_loss_clip": 1.05879855, "balance_loss_mlp": 1.02713919, "epoch": 0.3098659291769374, "flos": 20229594433920.0, "grad_norm": 1.760467006113623, "language_loss": 0.76553208, "learning_rate": 3.233272647887966e-06, "loss": 0.7875278, "num_input_tokens_seen": 55373275, "step": 2577, "time_per_iteration": 2.530245542526245 }, { "auxiliary_loss_clip": 0.01199314, "auxiliary_loss_mlp": 0.01030704, "balance_loss_clip": 1.06124723, "balance_loss_mlp": 1.02198398, "epoch": 0.3099861720675765, "flos": 24748556682240.0, "grad_norm": 1.6250165243498411, "language_loss": 0.9014411, "learning_rate": 3.2326593103747985e-06, "loss": 0.92374122, "num_input_tokens_seen": 55392290, "step": 2578, "time_per_iteration": 2.4958388805389404 }, { "auxiliary_loss_clip": 0.01184279, "auxiliary_loss_mlp": 0.01038896, "balance_loss_clip": 1.06096125, "balance_loss_mlp": 1.03059363, "epoch": 0.3101064149582156, "flos": 11765485704960.0, "grad_norm": 2.186232622402892, "language_loss": 0.85363328, "learning_rate": 3.2320457858681936e-06, "loss": 0.87586498, "num_input_tokens_seen": 55410680, "step": 2579, "time_per_iteration": 2.487229108810425 }, { "auxiliary_loss_clip": 0.01167997, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.05534267, "balance_loss_mlp": 1.03137553, "epoch": 0.31022665784885467, "flos": 23033247626880.0, "grad_norm": 2.7059933582434654, "language_loss": 0.85236573, "learning_rate": 3.2314320744612228e-06, "loss": 0.87444377, "num_input_tokens_seen": 55425980, "step": 2580, "time_per_iteration": 2.5190415382385254 }, { "auxiliary_loss_clip": 0.0118183, "auxiliary_loss_mlp": 0.01032724, "balance_loss_clip": 1.05926776, "balance_loss_mlp": 1.02445078, "epoch": 0.3103469007394938, "flos": 16289188548480.0, "grad_norm": 1.5164846884592809, "language_loss": 0.76435959, "learning_rate": 3.2308181762469854e-06, "loss": 0.78650516, "num_input_tokens_seen": 55443925, "step": 2581, "time_per_iteration": 2.4898293018341064 }, { "auxiliary_loss_clip": 0.01204998, "auxiliary_loss_mlp": 0.01045996, "balance_loss_clip": 1.06319666, "balance_loss_mlp": 1.03603017, "epoch": 0.3104671436301329, "flos": 30515271626880.0, "grad_norm": 2.1156745469573432, "language_loss": 0.78442025, "learning_rate": 3.230204091318609e-06, "loss": 0.80693024, "num_input_tokens_seen": 55464465, "step": 2582, "time_per_iteration": 2.536320447921753 }, { "auxiliary_loss_clip": 0.011962, "auxiliary_loss_mlp": 0.00762368, "balance_loss_clip": 1.05881882, "balance_loss_mlp": 1.00037527, "epoch": 0.31058738652077195, "flos": 20047240062720.0, "grad_norm": 2.125422123524982, "language_loss": 0.84495246, "learning_rate": 3.2295898197692503e-06, "loss": 0.86453819, "num_input_tokens_seen": 55483425, "step": 2583, "time_per_iteration": 2.4711720943450928 }, { "auxiliary_loss_clip": 0.01196826, "auxiliary_loss_mlp": 0.01030056, "balance_loss_clip": 1.05853045, "balance_loss_mlp": 1.02132416, "epoch": 0.31070762941141106, "flos": 28074639237120.0, "grad_norm": 1.6306482694965112, "language_loss": 0.79127759, "learning_rate": 3.228975361692094e-06, "loss": 0.81354636, "num_input_tokens_seen": 55504445, "step": 2584, "time_per_iteration": 3.3604116439819336 }, { "auxiliary_loss_clip": 0.01187555, "auxiliary_loss_mlp": 0.0076323, "balance_loss_clip": 1.05737114, "balance_loss_mlp": 1.0003922, "epoch": 0.31082787230205017, "flos": 20521907314560.0, "grad_norm": 2.1715454412058417, "language_loss": 0.80287743, "learning_rate": 3.228360717180352e-06, "loss": 0.82238531, "num_input_tokens_seen": 55521970, "step": 2585, "time_per_iteration": 2.521383047103882 }, { "auxiliary_loss_clip": 0.01084792, "auxiliary_loss_mlp": 0.00752521, "balance_loss_clip": 1.02096081, "balance_loss_mlp": 1.00038707, "epoch": 0.3109481151926892, "flos": 62445928723200.0, "grad_norm": 0.8340359122380687, "language_loss": 0.59446681, "learning_rate": 3.227745886327266e-06, "loss": 0.61283994, "num_input_tokens_seen": 55580665, "step": 2586, "time_per_iteration": 3.0363762378692627 }, { "auxiliary_loss_clip": 0.01085487, "auxiliary_loss_mlp": 0.01014345, "balance_loss_clip": 1.02131474, "balance_loss_mlp": 1.01268756, "epoch": 0.31106835808332833, "flos": 44746744723200.0, "grad_norm": 0.8143845582890822, "language_loss": 0.55838692, "learning_rate": 3.227130869226105e-06, "loss": 0.57938522, "num_input_tokens_seen": 55637825, "step": 2587, "time_per_iteration": 3.05106258392334 }, { "auxiliary_loss_clip": 0.01183869, "auxiliary_loss_mlp": 0.01030405, "balance_loss_clip": 1.0571394, "balance_loss_mlp": 1.02188742, "epoch": 0.3111886009739674, "flos": 23403056100480.0, "grad_norm": 2.468970215951078, "language_loss": 0.82798207, "learning_rate": 3.226515665970167e-06, "loss": 0.85012478, "num_input_tokens_seen": 55655365, "step": 2588, "time_per_iteration": 2.5338547229766846 }, { "auxiliary_loss_clip": 0.01179685, "auxiliary_loss_mlp": 0.01037407, "balance_loss_clip": 1.05637288, "balance_loss_mlp": 1.02788877, "epoch": 0.3113088438646065, "flos": 17530728192000.0, "grad_norm": 2.3552037820621528, "language_loss": 0.86643624, "learning_rate": 3.225900276652777e-06, "loss": 0.8886072, "num_input_tokens_seen": 55672140, "step": 2589, "time_per_iteration": 2.470212697982788 }, { "auxiliary_loss_clip": 0.01176611, "auxiliary_loss_mlp": 0.01028521, "balance_loss_clip": 1.05768454, "balance_loss_mlp": 1.01938939, "epoch": 0.3114290867552456, "flos": 28365802882560.0, "grad_norm": 1.5958811305156895, "language_loss": 0.75541049, "learning_rate": 3.2252847013672906e-06, "loss": 0.77746189, "num_input_tokens_seen": 55694800, "step": 2590, "time_per_iteration": 2.618248224258423 }, { "auxiliary_loss_clip": 0.01145971, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.05298853, "balance_loss_mlp": 1.02383578, "epoch": 0.31154932964588467, "flos": 27379157126400.0, "grad_norm": 1.8960331274087485, "language_loss": 0.75863194, "learning_rate": 3.224668940207089e-06, "loss": 0.78041464, "num_input_tokens_seen": 55713785, "step": 2591, "time_per_iteration": 4.065414190292358 }, { "auxiliary_loss_clip": 0.01132102, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.04990804, "balance_loss_mlp": 1.02453053, "epoch": 0.3116695725365238, "flos": 26541864120960.0, "grad_norm": 2.3134378026421927, "language_loss": 0.86128932, "learning_rate": 3.2240529932655828e-06, "loss": 0.88295192, "num_input_tokens_seen": 55733050, "step": 2592, "time_per_iteration": 2.6548502445220947 }, { "auxiliary_loss_clip": 0.01169015, "auxiliary_loss_mlp": 0.01027712, "balance_loss_clip": 1.05872762, "balance_loss_mlp": 1.0191766, "epoch": 0.3117898154271629, "flos": 21177600134400.0, "grad_norm": 2.6660959686064594, "language_loss": 0.88255298, "learning_rate": 3.223436860636211e-06, "loss": 0.90452015, "num_input_tokens_seen": 55748685, "step": 2593, "time_per_iteration": 2.5286741256713867 }, { "auxiliary_loss_clip": 0.01199401, "auxiliary_loss_mlp": 0.01029395, "balance_loss_clip": 1.06186783, "balance_loss_mlp": 1.02081227, "epoch": 0.31191005831780194, "flos": 27272430840960.0, "grad_norm": 1.548227437678018, "language_loss": 0.73999971, "learning_rate": 3.2228205424124403e-06, "loss": 0.76228762, "num_input_tokens_seen": 55771840, "step": 2594, "time_per_iteration": 3.3637282848358154 }, { "auxiliary_loss_clip": 0.01153119, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 1.05186605, "balance_loss_mlp": 1.02512956, "epoch": 0.31203030120844105, "flos": 12963501043200.0, "grad_norm": 2.2025592442268214, "language_loss": 0.74561393, "learning_rate": 3.222204038687765e-06, "loss": 0.76748437, "num_input_tokens_seen": 55784975, "step": 2595, "time_per_iteration": 2.65503191947937 }, { "auxiliary_loss_clip": 0.0118014, "auxiliary_loss_mlp": 0.01024575, "balance_loss_clip": 1.05810118, "balance_loss_mlp": 1.01636124, "epoch": 0.31215054409908016, "flos": 27562014288000.0, "grad_norm": 1.6037843509077017, "language_loss": 0.88446277, "learning_rate": 3.221587349555709e-06, "loss": 0.90650988, "num_input_tokens_seen": 55805235, "step": 2596, "time_per_iteration": 2.5648632049560547 }, { "auxiliary_loss_clip": 0.01171159, "auxiliary_loss_mlp": 0.01035573, "balance_loss_clip": 1.05639791, "balance_loss_mlp": 1.02736545, "epoch": 0.3122707869897192, "flos": 21506326427520.0, "grad_norm": 1.5893122690443362, "language_loss": 0.69436681, "learning_rate": 3.2209704751098236e-06, "loss": 0.71643412, "num_input_tokens_seen": 55824265, "step": 2597, "time_per_iteration": 2.5491015911102295 }, { "auxiliary_loss_clip": 0.01171617, "auxiliary_loss_mlp": 0.01036219, "balance_loss_clip": 1.05764306, "balance_loss_mlp": 1.02753484, "epoch": 0.31239102988035833, "flos": 15187017674880.0, "grad_norm": 2.0083228241920237, "language_loss": 0.82886815, "learning_rate": 3.2203534154436875e-06, "loss": 0.85094643, "num_input_tokens_seen": 55838620, "step": 2598, "time_per_iteration": 2.4859020709991455 }, { "auxiliary_loss_clip": 0.01120067, "auxiliary_loss_mlp": 0.01034759, "balance_loss_clip": 1.04914236, "balance_loss_mlp": 1.02587771, "epoch": 0.31251127277099744, "flos": 22053712763520.0, "grad_norm": 1.929881430816264, "language_loss": 0.75656509, "learning_rate": 3.2197361706509084e-06, "loss": 0.77811331, "num_input_tokens_seen": 55859375, "step": 2599, "time_per_iteration": 2.651862859725952 }, { "auxiliary_loss_clip": 0.01198037, "auxiliary_loss_mlp": 0.0103652, "balance_loss_clip": 1.05908799, "balance_loss_mlp": 1.02704871, "epoch": 0.3126315156616365, "flos": 15193984913280.0, "grad_norm": 3.0542169096592104, "language_loss": 0.83419394, "learning_rate": 3.2191187408251228e-06, "loss": 0.85653949, "num_input_tokens_seen": 55876535, "step": 2600, "time_per_iteration": 2.428502321243286 }, { "auxiliary_loss_clip": 0.01187345, "auxiliary_loss_mlp": 0.01037465, "balance_loss_clip": 1.05699444, "balance_loss_mlp": 1.02795172, "epoch": 0.3127517585522756, "flos": 18145338831360.0, "grad_norm": 1.973567597144241, "language_loss": 0.79035676, "learning_rate": 3.218501126059993e-06, "loss": 0.8126049, "num_input_tokens_seen": 55891930, "step": 2601, "time_per_iteration": 2.4654862880706787 }, { "auxiliary_loss_clip": 0.011797, "auxiliary_loss_mlp": 0.01039187, "balance_loss_clip": 1.05189085, "balance_loss_mlp": 1.03037763, "epoch": 0.31287200144291466, "flos": 21908633731200.0, "grad_norm": 2.0196101193473868, "language_loss": 0.81580198, "learning_rate": 3.2178833264492116e-06, "loss": 0.83799088, "num_input_tokens_seen": 55910635, "step": 2602, "time_per_iteration": 2.5027902126312256 }, { "auxiliary_loss_clip": 0.01189436, "auxiliary_loss_mlp": 0.01033247, "balance_loss_clip": 1.05938458, "balance_loss_mlp": 1.02432442, "epoch": 0.31299224433355377, "flos": 29896997800320.0, "grad_norm": 1.8364446185421885, "language_loss": 0.76316798, "learning_rate": 3.217265342086498e-06, "loss": 0.78539479, "num_input_tokens_seen": 55931125, "step": 2603, "time_per_iteration": 2.580620765686035 }, { "auxiliary_loss_clip": 0.01157432, "auxiliary_loss_mlp": 0.00763305, "balance_loss_clip": 1.05653739, "balance_loss_mlp": 1.0003705, "epoch": 0.3131124872241929, "flos": 11655886331520.0, "grad_norm": 2.2594832086898475, "language_loss": 0.73423529, "learning_rate": 3.216647173065599e-06, "loss": 0.75344265, "num_input_tokens_seen": 55946590, "step": 2604, "time_per_iteration": 2.5388145446777344 }, { "auxiliary_loss_clip": 0.01171201, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 1.06009316, "balance_loss_mlp": 1.02734864, "epoch": 0.31323273011483194, "flos": 49848785470080.0, "grad_norm": 1.7390544968430852, "language_loss": 0.7379992, "learning_rate": 3.216028819480292e-06, "loss": 0.76007318, "num_input_tokens_seen": 55967930, "step": 2605, "time_per_iteration": 2.78756046295166 }, { "auxiliary_loss_clip": 0.01154863, "auxiliary_loss_mlp": 0.0103104, "balance_loss_clip": 1.05536354, "balance_loss_mlp": 1.02292204, "epoch": 0.31335297300547105, "flos": 22601278667520.0, "grad_norm": 2.823143289570943, "language_loss": 0.75486314, "learning_rate": 3.2154102814243793e-06, "loss": 0.77672213, "num_input_tokens_seen": 55987070, "step": 2606, "time_per_iteration": 2.5432121753692627 }, { "auxiliary_loss_clip": 0.01156079, "auxiliary_loss_mlp": 0.01033865, "balance_loss_clip": 1.05355072, "balance_loss_mlp": 1.02533531, "epoch": 0.31347321589611016, "flos": 34710858708480.0, "grad_norm": 1.8815780773325272, "language_loss": 0.66715771, "learning_rate": 3.2147915589916937e-06, "loss": 0.68905711, "num_input_tokens_seen": 56008630, "step": 2607, "time_per_iteration": 2.687453031539917 }, { "auxiliary_loss_clip": 0.01159803, "auxiliary_loss_mlp": 0.01040475, "balance_loss_clip": 1.05297351, "balance_loss_mlp": 1.03156447, "epoch": 0.3135934587867492, "flos": 19755789108480.0, "grad_norm": 1.7161850040529079, "language_loss": 0.8293196, "learning_rate": 3.2141726522760938e-06, "loss": 0.85132241, "num_input_tokens_seen": 56026690, "step": 2608, "time_per_iteration": 2.5403823852539062 }, { "auxiliary_loss_clip": 0.01070148, "auxiliary_loss_mlp": 0.01001901, "balance_loss_clip": 1.01810992, "balance_loss_mlp": 1.00038147, "epoch": 0.3137137016773883, "flos": 65815535583360.0, "grad_norm": 0.7112160536712682, "language_loss": 0.5271278, "learning_rate": 3.213553561371469e-06, "loss": 0.54784828, "num_input_tokens_seen": 56090425, "step": 2609, "time_per_iteration": 3.1994080543518066 }, { "auxiliary_loss_clip": 0.01139125, "auxiliary_loss_mlp": 0.01029714, "balance_loss_clip": 1.05596209, "balance_loss_mlp": 1.02164948, "epoch": 0.31383394456802743, "flos": 16252739222400.0, "grad_norm": 2.5657608057211103, "language_loss": 0.95867503, "learning_rate": 3.212934286371733e-06, "loss": 0.98036343, "num_input_tokens_seen": 56107135, "step": 2610, "time_per_iteration": 3.3194291591644287 }, { "auxiliary_loss_clip": 0.01181141, "auxiliary_loss_mlp": 0.01032643, "balance_loss_clip": 1.05972624, "balance_loss_mlp": 1.02336287, "epoch": 0.3139541874586665, "flos": 38795517613440.0, "grad_norm": 2.027615889655219, "language_loss": 0.83288139, "learning_rate": 3.2123148273708304e-06, "loss": 0.85501933, "num_input_tokens_seen": 56127325, "step": 2611, "time_per_iteration": 2.6463425159454346 }, { "auxiliary_loss_clip": 0.01197835, "auxiliary_loss_mlp": 0.01030489, "balance_loss_clip": 1.06146288, "balance_loss_mlp": 1.02233505, "epoch": 0.3140744303493056, "flos": 25046328430080.0, "grad_norm": 2.08020493929946, "language_loss": 0.77134627, "learning_rate": 3.211695184462733e-06, "loss": 0.79362953, "num_input_tokens_seen": 56148500, "step": 2612, "time_per_iteration": 2.509791374206543 }, { "auxiliary_loss_clip": 0.01049354, "auxiliary_loss_mlp": 0.01001951, "balance_loss_clip": 1.01739073, "balance_loss_mlp": 1.00031805, "epoch": 0.3141946732399447, "flos": 72504254782080.0, "grad_norm": 0.890321072590176, "language_loss": 0.60453182, "learning_rate": 3.2110753577414383e-06, "loss": 0.62504488, "num_input_tokens_seen": 56210080, "step": 2613, "time_per_iteration": 3.141970157623291 }, { "auxiliary_loss_clip": 0.01170268, "auxiliary_loss_mlp": 0.01038504, "balance_loss_clip": 1.05535674, "balance_loss_mlp": 1.02889609, "epoch": 0.31431491613058377, "flos": 19239788280960.0, "grad_norm": 2.2365450570677967, "language_loss": 0.79405403, "learning_rate": 3.2104553473009757e-06, "loss": 0.81614184, "num_input_tokens_seen": 56228200, "step": 2614, "time_per_iteration": 2.5333120822906494 }, { "auxiliary_loss_clip": 0.01132603, "auxiliary_loss_mlp": 0.01028743, "balance_loss_clip": 1.05070233, "balance_loss_mlp": 1.02012992, "epoch": 0.3144351590212229, "flos": 36210596290560.0, "grad_norm": 1.8656159937816497, "language_loss": 0.67947114, "learning_rate": 3.209835153235399e-06, "loss": 0.70108461, "num_input_tokens_seen": 56249755, "step": 2615, "time_per_iteration": 2.694971799850464 }, { "auxiliary_loss_clip": 0.01140705, "auxiliary_loss_mlp": 0.01040729, "balance_loss_clip": 1.05084348, "balance_loss_mlp": 1.03248, "epoch": 0.314555401911862, "flos": 18551740285440.0, "grad_norm": 1.7812893564095367, "language_loss": 0.67835236, "learning_rate": 3.2092147756387916e-06, "loss": 0.7001667, "num_input_tokens_seen": 56270080, "step": 2616, "time_per_iteration": 2.591644763946533 }, { "auxiliary_loss_clip": 0.01158283, "auxiliary_loss_mlp": 0.01036429, "balance_loss_clip": 1.05169749, "balance_loss_mlp": 1.02752399, "epoch": 0.31467564480250104, "flos": 16362877299840.0, "grad_norm": 1.7067710260144815, "language_loss": 0.83614421, "learning_rate": 3.208594214605264e-06, "loss": 0.85809135, "num_input_tokens_seen": 56288625, "step": 2617, "time_per_iteration": 4.0384862422943115 }, { "auxiliary_loss_clip": 0.01154726, "auxiliary_loss_mlp": 0.01029515, "balance_loss_clip": 1.05311823, "balance_loss_mlp": 1.02211857, "epoch": 0.31479588769314015, "flos": 21652375127040.0, "grad_norm": 2.9817987484213497, "language_loss": 0.77218896, "learning_rate": 3.2079734702289553e-06, "loss": 0.79403138, "num_input_tokens_seen": 56307520, "step": 2618, "time_per_iteration": 2.5486955642700195 }, { "auxiliary_loss_clip": 0.01068005, "auxiliary_loss_mlp": 0.00752077, "balance_loss_clip": 1.018471, "balance_loss_mlp": 1.00029206, "epoch": 0.3149161305837792, "flos": 66051072040320.0, "grad_norm": 0.8032105608109557, "language_loss": 0.60423696, "learning_rate": 3.207352542604031e-06, "loss": 0.62243783, "num_input_tokens_seen": 56369855, "step": 2619, "time_per_iteration": 3.1875483989715576 }, { "auxiliary_loss_clip": 0.01137888, "auxiliary_loss_mlp": 0.01036628, "balance_loss_clip": 1.0488627, "balance_loss_mlp": 1.02907586, "epoch": 0.3150363734744183, "flos": 28987201192320.0, "grad_norm": 1.688789880168598, "language_loss": 0.78459257, "learning_rate": 3.2067314318246864e-06, "loss": 0.80633777, "num_input_tokens_seen": 56390570, "step": 2620, "time_per_iteration": 3.4361112117767334 }, { "auxiliary_loss_clip": 0.01153217, "auxiliary_loss_mlp": 0.01036602, "balance_loss_clip": 1.05605221, "balance_loss_mlp": 1.02753055, "epoch": 0.31515661636505743, "flos": 27636600879360.0, "grad_norm": 1.6761251774850259, "language_loss": 0.77978599, "learning_rate": 3.206110137985143e-06, "loss": 0.8016842, "num_input_tokens_seen": 56410775, "step": 2621, "time_per_iteration": 2.6272075176239014 }, { "auxiliary_loss_clip": 0.01141856, "auxiliary_loss_mlp": 0.01042495, "balance_loss_clip": 1.05201685, "balance_loss_mlp": 1.03419828, "epoch": 0.3152768592556965, "flos": 24605632465920.0, "grad_norm": 1.86622348775602, "language_loss": 0.92265069, "learning_rate": 3.2054886611796505e-06, "loss": 0.94449425, "num_input_tokens_seen": 56429770, "step": 2622, "time_per_iteration": 2.614384174346924 }, { "auxiliary_loss_clip": 0.0108116, "auxiliary_loss_mlp": 0.01003168, "balance_loss_clip": 1.01829171, "balance_loss_mlp": 1.00165367, "epoch": 0.3153971021463356, "flos": 68476908026880.0, "grad_norm": 0.8849685160866501, "language_loss": 0.63569796, "learning_rate": 3.204867001502487e-06, "loss": 0.65654123, "num_input_tokens_seen": 56488425, "step": 2623, "time_per_iteration": 3.0761187076568604 }, { "auxiliary_loss_clip": 0.01198412, "auxiliary_loss_mlp": 0.01036728, "balance_loss_clip": 1.06179595, "balance_loss_mlp": 1.02793646, "epoch": 0.3155173450369747, "flos": 25593714766080.0, "grad_norm": 1.7163933315117532, "language_loss": 0.80837202, "learning_rate": 3.2042451590479567e-06, "loss": 0.8307234, "num_input_tokens_seen": 56508940, "step": 2624, "time_per_iteration": 2.5493316650390625 }, { "auxiliary_loss_clip": 0.01191862, "auxiliary_loss_mlp": 0.01031722, "balance_loss_clip": 1.05890584, "balance_loss_mlp": 1.02378881, "epoch": 0.31563758792761376, "flos": 24309333175680.0, "grad_norm": 1.675538353584152, "language_loss": 0.87117887, "learning_rate": 3.203623133910394e-06, "loss": 0.89341462, "num_input_tokens_seen": 56527245, "step": 2625, "time_per_iteration": 2.4917385578155518 }, { "auxiliary_loss_clip": 0.01125117, "auxiliary_loss_mlp": 0.01033732, "balance_loss_clip": 1.04885519, "balance_loss_mlp": 1.02572131, "epoch": 0.31575783081825287, "flos": 31903865550720.0, "grad_norm": 3.464416011822075, "language_loss": 0.77414489, "learning_rate": 3.203000926184158e-06, "loss": 0.79573333, "num_input_tokens_seen": 56546170, "step": 2626, "time_per_iteration": 2.7675654888153076 }, { "auxiliary_loss_clip": 0.01193828, "auxiliary_loss_mlp": 0.01032746, "balance_loss_clip": 1.05849981, "balance_loss_mlp": 1.02439523, "epoch": 0.315878073708892, "flos": 30810960385920.0, "grad_norm": 1.64914061224499, "language_loss": 0.77796954, "learning_rate": 3.202378535963639e-06, "loss": 0.80023527, "num_input_tokens_seen": 56567085, "step": 2627, "time_per_iteration": 2.5715365409851074 }, { "auxiliary_loss_clip": 0.01155921, "auxiliary_loss_mlp": 0.00762502, "balance_loss_clip": 1.05190134, "balance_loss_mlp": 1.00034094, "epoch": 0.31599831659953104, "flos": 22200264253440.0, "grad_norm": 1.59793280589802, "language_loss": 0.84087592, "learning_rate": 3.2017559633432516e-06, "loss": 0.86006021, "num_input_tokens_seen": 56586715, "step": 2628, "time_per_iteration": 2.6015841960906982 }, { "auxiliary_loss_clip": 0.01173523, "auxiliary_loss_mlp": 0.01032799, "balance_loss_clip": 1.05579281, "balance_loss_mlp": 1.02412665, "epoch": 0.31611855949017015, "flos": 25593463370880.0, "grad_norm": 2.1497887832158384, "language_loss": 0.65993404, "learning_rate": 3.2011332084174398e-06, "loss": 0.6819973, "num_input_tokens_seen": 56607585, "step": 2629, "time_per_iteration": 2.6395835876464844 }, { "auxiliary_loss_clip": 0.01177637, "auxiliary_loss_mlp": 0.01031795, "balance_loss_clip": 1.05661488, "balance_loss_mlp": 1.02271152, "epoch": 0.31623880238080926, "flos": 20594087694720.0, "grad_norm": 1.914409029705995, "language_loss": 0.89154434, "learning_rate": 3.2005102712806756e-06, "loss": 0.91363865, "num_input_tokens_seen": 56626415, "step": 2630, "time_per_iteration": 2.5515828132629395 }, { "auxiliary_loss_clip": 0.01182769, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.05701363, "balance_loss_mlp": 1.02795768, "epoch": 0.3163590452714483, "flos": 12784917600000.0, "grad_norm": 2.5350831691850937, "language_loss": 0.73007977, "learning_rate": 3.1998871520274575e-06, "loss": 0.75227457, "num_input_tokens_seen": 56641750, "step": 2631, "time_per_iteration": 2.525724172592163 }, { "auxiliary_loss_clip": 0.01166509, "auxiliary_loss_mlp": 0.01033643, "balance_loss_clip": 1.05273175, "balance_loss_mlp": 1.02474999, "epoch": 0.3164792881620874, "flos": 23041292273280.0, "grad_norm": 1.9071740148140937, "language_loss": 0.85228741, "learning_rate": 3.199263850752312e-06, "loss": 0.87428904, "num_input_tokens_seen": 56662585, "step": 2632, "time_per_iteration": 2.612267017364502 }, { "auxiliary_loss_clip": 0.01182211, "auxiliary_loss_mlp": 0.01031869, "balance_loss_clip": 1.05890751, "balance_loss_mlp": 1.02358365, "epoch": 0.31659953105272653, "flos": 18296271780480.0, "grad_norm": 2.3301343111931327, "language_loss": 0.85498965, "learning_rate": 3.198640367549795e-06, "loss": 0.87713045, "num_input_tokens_seen": 56681480, "step": 2633, "time_per_iteration": 2.5099260807037354 }, { "auxiliary_loss_clip": 0.01180273, "auxiliary_loss_mlp": 0.0076222, "balance_loss_clip": 1.05535388, "balance_loss_mlp": 1.00034976, "epoch": 0.3167197739433656, "flos": 25703421880320.0, "grad_norm": 1.6577391102634011, "language_loss": 0.86008906, "learning_rate": 3.198016702514487e-06, "loss": 0.87951398, "num_input_tokens_seen": 56701760, "step": 2634, "time_per_iteration": 2.6008989810943604 }, { "auxiliary_loss_clip": 0.01191448, "auxiliary_loss_mlp": 0.0103135, "balance_loss_clip": 1.05753326, "balance_loss_mlp": 1.02389669, "epoch": 0.3168400168340047, "flos": 23546016230400.0, "grad_norm": 1.6940686550640216, "language_loss": 0.84465146, "learning_rate": 3.1973928557409972e-06, "loss": 0.86687946, "num_input_tokens_seen": 56719800, "step": 2635, "time_per_iteration": 2.502098321914673 }, { "auxiliary_loss_clip": 0.01190271, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.05749774, "balance_loss_mlp": 1.02861929, "epoch": 0.31696025972464376, "flos": 28366449327360.0, "grad_norm": 2.0068180523141437, "language_loss": 0.71295696, "learning_rate": 3.1967688273239636e-06, "loss": 0.73522681, "num_input_tokens_seen": 56739605, "step": 2636, "time_per_iteration": 3.3501522541046143 }, { "auxiliary_loss_clip": 0.01150769, "auxiliary_loss_mlp": 0.01024294, "balance_loss_clip": 1.05323672, "balance_loss_mlp": 1.01638103, "epoch": 0.31708050261528287, "flos": 16399111144320.0, "grad_norm": 2.638067879483366, "language_loss": 0.82267773, "learning_rate": 3.1961446173580503e-06, "loss": 0.84442842, "num_input_tokens_seen": 56756545, "step": 2637, "time_per_iteration": 2.5423877239227295 }, { "auxiliary_loss_clip": 0.01164621, "auxiliary_loss_mlp": 0.01034778, "balance_loss_clip": 1.05633152, "balance_loss_mlp": 1.02626014, "epoch": 0.317200745505922, "flos": 26212347728640.0, "grad_norm": 1.668951798371586, "language_loss": 0.77542055, "learning_rate": 3.1955202259379502e-06, "loss": 0.79741454, "num_input_tokens_seen": 56778275, "step": 2638, "time_per_iteration": 2.623281240463257 }, { "auxiliary_loss_clip": 0.01178847, "auxiliary_loss_mlp": 0.0103325, "balance_loss_clip": 1.05596209, "balance_loss_mlp": 1.02428567, "epoch": 0.31732098839656103, "flos": 31350876693120.0, "grad_norm": 1.8602806066028563, "language_loss": 0.8308391, "learning_rate": 3.194895653158381e-06, "loss": 0.85296011, "num_input_tokens_seen": 56797215, "step": 2639, "time_per_iteration": 2.5842666625976562 }, { "auxiliary_loss_clip": 0.010813, "auxiliary_loss_mlp": 0.01003439, "balance_loss_clip": 1.0184083, "balance_loss_mlp": 1.00191927, "epoch": 0.31744123128720014, "flos": 58989024835200.0, "grad_norm": 0.7765355675301934, "language_loss": 0.55556881, "learning_rate": 3.194270899114093e-06, "loss": 0.57641613, "num_input_tokens_seen": 56863010, "step": 2640, "time_per_iteration": 3.195396900177002 }, { "auxiliary_loss_clip": 0.01187455, "auxiliary_loss_mlp": 0.01042995, "balance_loss_clip": 1.05870187, "balance_loss_mlp": 1.03431654, "epoch": 0.31756147417783925, "flos": 17417573372160.0, "grad_norm": 1.8181757950168371, "language_loss": 0.81895441, "learning_rate": 3.193645963899858e-06, "loss": 0.84125888, "num_input_tokens_seen": 56880625, "step": 2641, "time_per_iteration": 2.528512477874756 }, { "auxiliary_loss_clip": 0.01159542, "auxiliary_loss_mlp": 0.01031409, "balance_loss_clip": 1.05323005, "balance_loss_mlp": 1.02257013, "epoch": 0.3176817170684783, "flos": 25481673267840.0, "grad_norm": 5.24323609099548, "language_loss": 0.84125865, "learning_rate": 3.193020847610479e-06, "loss": 0.86316812, "num_input_tokens_seen": 56900945, "step": 2642, "time_per_iteration": 2.562692403793335 }, { "auxiliary_loss_clip": 0.01157786, "auxiliary_loss_mlp": 0.01030856, "balance_loss_clip": 1.05443454, "balance_loss_mlp": 1.02140319, "epoch": 0.3178019599591174, "flos": 24972603765120.0, "grad_norm": 2.080888321157409, "language_loss": 0.71308017, "learning_rate": 3.192395550340787e-06, "loss": 0.73496664, "num_input_tokens_seen": 56918895, "step": 2643, "time_per_iteration": 3.496281862258911 }, { "auxiliary_loss_clip": 0.01178285, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.05687571, "balance_loss_mlp": 1.02549767, "epoch": 0.31792220284975653, "flos": 12422220019200.0, "grad_norm": 2.018649976935453, "language_loss": 0.76772875, "learning_rate": 3.191770072185638e-06, "loss": 0.78984857, "num_input_tokens_seen": 56935890, "step": 2644, "time_per_iteration": 2.4758667945861816 }, { "auxiliary_loss_clip": 0.01178847, "auxiliary_loss_mlp": 0.01032587, "balance_loss_clip": 1.05875802, "balance_loss_mlp": 1.02428722, "epoch": 0.3180424457403956, "flos": 15485759089920.0, "grad_norm": 2.5372633830720703, "language_loss": 0.73069, "learning_rate": 3.191144413239916e-06, "loss": 0.7528044, "num_input_tokens_seen": 56952460, "step": 2645, "time_per_iteration": 2.4814114570617676 }, { "auxiliary_loss_clip": 0.01164879, "auxiliary_loss_mlp": 0.01036981, "balance_loss_clip": 1.05476785, "balance_loss_mlp": 1.02783799, "epoch": 0.3181626886310347, "flos": 26174964648960.0, "grad_norm": 1.9477532923758072, "language_loss": 0.8827486, "learning_rate": 3.190518573598534e-06, "loss": 0.90476716, "num_input_tokens_seen": 56969065, "step": 2646, "time_per_iteration": 3.4279568195343018 }, { "auxiliary_loss_clip": 0.01158273, "auxiliary_loss_mlp": 0.01034064, "balance_loss_clip": 1.05383265, "balance_loss_mlp": 1.02538562, "epoch": 0.3182829315216738, "flos": 25483109811840.0, "grad_norm": 1.5171096439817755, "language_loss": 0.7737484, "learning_rate": 3.1898925533564308e-06, "loss": 0.79567182, "num_input_tokens_seen": 56990535, "step": 2647, "time_per_iteration": 2.6136019229888916 }, { "auxiliary_loss_clip": 0.01137716, "auxiliary_loss_mlp": 0.01029777, "balance_loss_clip": 1.04845619, "balance_loss_mlp": 1.02066374, "epoch": 0.31840317441231286, "flos": 18113701927680.0, "grad_norm": 1.8613188658150552, "language_loss": 0.63824475, "learning_rate": 3.1892663526085733e-06, "loss": 0.65991968, "num_input_tokens_seen": 57008910, "step": 2648, "time_per_iteration": 2.559540033340454 }, { "auxiliary_loss_clip": 0.01081809, "auxiliary_loss_mlp": 0.01003627, "balance_loss_clip": 1.01885808, "balance_loss_mlp": 1.00200617, "epoch": 0.31852341730295197, "flos": 64741948957440.0, "grad_norm": 0.769466184298523, "language_loss": 0.56930357, "learning_rate": 3.188639971449956e-06, "loss": 0.59015793, "num_input_tokens_seen": 57074960, "step": 2649, "time_per_iteration": 3.057826519012451 }, { "auxiliary_loss_clip": 0.0119535, "auxiliary_loss_mlp": 0.01031263, "balance_loss_clip": 1.05960917, "balance_loss_mlp": 1.02302003, "epoch": 0.318643660193591, "flos": 20668135582080.0, "grad_norm": 1.7832300950106799, "language_loss": 0.72373605, "learning_rate": 3.1880134099756e-06, "loss": 0.7460022, "num_input_tokens_seen": 57094595, "step": 2650, "time_per_iteration": 2.473712921142578 }, { "auxiliary_loss_clip": 0.01174526, "auxiliary_loss_mlp": 0.01031098, "balance_loss_clip": 1.05319715, "balance_loss_mlp": 1.02305746, "epoch": 0.31876390308423014, "flos": 26943345411840.0, "grad_norm": 1.9483728768602098, "language_loss": 0.69597852, "learning_rate": 3.1873866682805535e-06, "loss": 0.7180348, "num_input_tokens_seen": 57115290, "step": 2651, "time_per_iteration": 2.541693925857544 }, { "auxiliary_loss_clip": 0.01166766, "auxiliary_loss_mlp": 0.01030791, "balance_loss_clip": 1.05519795, "balance_loss_mlp": 1.02270222, "epoch": 0.31888414597486925, "flos": 18041916597120.0, "grad_norm": 1.8911633812860198, "language_loss": 0.88600028, "learning_rate": 3.186759746459894e-06, "loss": 0.90797579, "num_input_tokens_seen": 57134400, "step": 2652, "time_per_iteration": 2.5064198970794678 }, { "auxiliary_loss_clip": 0.01163276, "auxiliary_loss_mlp": 0.01032238, "balance_loss_clip": 1.05516267, "balance_loss_mlp": 1.02396452, "epoch": 0.3190043888655083, "flos": 25149319701120.0, "grad_norm": 1.8117284301261898, "language_loss": 0.79823989, "learning_rate": 3.1861326446087246e-06, "loss": 0.82019496, "num_input_tokens_seen": 57153140, "step": 2653, "time_per_iteration": 2.5550949573516846 }, { "auxiliary_loss_clip": 0.01182722, "auxiliary_loss_mlp": 0.01029753, "balance_loss_clip": 1.0578866, "balance_loss_mlp": 1.02137852, "epoch": 0.3191246317561474, "flos": 22053892331520.0, "grad_norm": 1.8996230243304466, "language_loss": 0.71730888, "learning_rate": 3.1855053628221763e-06, "loss": 0.73943359, "num_input_tokens_seen": 57172395, "step": 2654, "time_per_iteration": 2.5159761905670166 }, { "auxiliary_loss_clip": 0.0114049, "auxiliary_loss_mlp": 0.01035162, "balance_loss_clip": 1.04897881, "balance_loss_mlp": 1.02589965, "epoch": 0.3192448746467865, "flos": 14901815687040.0, "grad_norm": 2.379114194613131, "language_loss": 0.89773118, "learning_rate": 3.184877901195407e-06, "loss": 0.91948771, "num_input_tokens_seen": 57189090, "step": 2655, "time_per_iteration": 2.5219740867614746 }, { "auxiliary_loss_clip": 0.01065623, "auxiliary_loss_mlp": 0.01003226, "balance_loss_clip": 1.02396584, "balance_loss_mlp": 1.00133097, "epoch": 0.3193651175374256, "flos": 67234832657280.0, "grad_norm": 0.7948835507769505, "language_loss": 0.62837976, "learning_rate": 3.184250259823602e-06, "loss": 0.64906824, "num_input_tokens_seen": 57251620, "step": 2656, "time_per_iteration": 3.2015395164489746 }, { "auxiliary_loss_clip": 0.01153462, "auxiliary_loss_mlp": 0.01031338, "balance_loss_clip": 1.05340707, "balance_loss_mlp": 1.02280271, "epoch": 0.3194853604280647, "flos": 12233077977600.0, "grad_norm": 1.9561590052542814, "language_loss": 0.81286621, "learning_rate": 3.183622438801974e-06, "loss": 0.83471429, "num_input_tokens_seen": 57266910, "step": 2657, "time_per_iteration": 2.5572798252105713 }, { "auxiliary_loss_clip": 0.01197131, "auxiliary_loss_mlp": 0.01033712, "balance_loss_clip": 1.06114006, "balance_loss_mlp": 1.02530766, "epoch": 0.3196056033187038, "flos": 14939917038720.0, "grad_norm": 1.8431228324286453, "language_loss": 0.75488532, "learning_rate": 3.1829944382257637e-06, "loss": 0.77719378, "num_input_tokens_seen": 57285040, "step": 2658, "time_per_iteration": 2.4816553592681885 }, { "auxiliary_loss_clip": 0.01179223, "auxiliary_loss_mlp": 0.01029998, "balance_loss_clip": 1.0590024, "balance_loss_mlp": 1.02207112, "epoch": 0.31972584620934286, "flos": 23768878164480.0, "grad_norm": 2.1946276165519625, "language_loss": 0.81406397, "learning_rate": 3.1823662581902373e-06, "loss": 0.83615619, "num_input_tokens_seen": 57302725, "step": 2659, "time_per_iteration": 2.540379762649536 }, { "auxiliary_loss_clip": 0.01135013, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.04629803, "balance_loss_mlp": 1.02719676, "epoch": 0.31984608909998197, "flos": 21251540280960.0, "grad_norm": 2.005244393547803, "language_loss": 0.74909663, "learning_rate": 3.1817378987906896e-06, "loss": 0.7708022, "num_input_tokens_seen": 57322230, "step": 2660, "time_per_iteration": 2.582280397415161 }, { "auxiliary_loss_clip": 0.01133156, "auxiliary_loss_mlp": 0.01040223, "balance_loss_clip": 1.05480337, "balance_loss_mlp": 1.03211069, "epoch": 0.3199663319906211, "flos": 18296235866880.0, "grad_norm": 1.939811434699013, "language_loss": 0.80028826, "learning_rate": 3.181109360122442e-06, "loss": 0.82202208, "num_input_tokens_seen": 57339820, "step": 2661, "time_per_iteration": 2.578160524368286 }, { "auxiliary_loss_clip": 0.01146673, "auxiliary_loss_mlp": 0.01033084, "balance_loss_clip": 1.05174303, "balance_loss_mlp": 1.02455509, "epoch": 0.32008657488126013, "flos": 18733627779840.0, "grad_norm": 2.8415841755471622, "language_loss": 0.78306174, "learning_rate": 3.1804806422808445e-06, "loss": 0.80485934, "num_input_tokens_seen": 57356955, "step": 2662, "time_per_iteration": 2.5615053176879883 }, { "auxiliary_loss_clip": 0.01155654, "auxiliary_loss_mlp": 0.01034959, "balance_loss_clip": 1.05432642, "balance_loss_mlp": 1.02626288, "epoch": 0.32020681777189924, "flos": 20595344670720.0, "grad_norm": 1.6790661683635506, "language_loss": 0.72832751, "learning_rate": 3.1798517453612714e-06, "loss": 0.75023359, "num_input_tokens_seen": 57376760, "step": 2663, "time_per_iteration": 3.279682159423828 }, { "auxiliary_loss_clip": 0.01176888, "auxiliary_loss_mlp": 0.01036639, "balance_loss_clip": 1.05929756, "balance_loss_mlp": 1.02835989, "epoch": 0.32032706066253835, "flos": 35261692750080.0, "grad_norm": 2.1262285027409646, "language_loss": 0.74783993, "learning_rate": 3.1792226694591265e-06, "loss": 0.76997524, "num_input_tokens_seen": 57398145, "step": 2664, "time_per_iteration": 2.6269819736480713 }, { "auxiliary_loss_clip": 0.01149373, "auxiliary_loss_mlp": 0.01039119, "balance_loss_clip": 1.05503643, "balance_loss_mlp": 1.03132248, "epoch": 0.3204473035531774, "flos": 15304230731520.0, "grad_norm": 1.807630391158078, "language_loss": 0.80303264, "learning_rate": 3.178593414669841e-06, "loss": 0.82491755, "num_input_tokens_seen": 57416730, "step": 2665, "time_per_iteration": 2.590731143951416 }, { "auxiliary_loss_clip": 0.01180263, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.05708146, "balance_loss_mlp": 1.02688622, "epoch": 0.3205675464438165, "flos": 24462564595200.0, "grad_norm": 2.315232937448908, "language_loss": 0.7082088, "learning_rate": 3.1779639810888707e-06, "loss": 0.73037684, "num_input_tokens_seen": 57436325, "step": 2666, "time_per_iteration": 2.523946762084961 }, { "auxiliary_loss_clip": 0.01176751, "auxiliary_loss_mlp": 0.01029251, "balance_loss_clip": 1.05775309, "balance_loss_mlp": 1.02138281, "epoch": 0.3206877893344556, "flos": 22456235548800.0, "grad_norm": 1.8092318757780659, "language_loss": 0.75824231, "learning_rate": 3.1773343688117013e-06, "loss": 0.78030235, "num_input_tokens_seen": 57457235, "step": 2667, "time_per_iteration": 2.5497312545776367 }, { "auxiliary_loss_clip": 0.01166177, "auxiliary_loss_mlp": 0.00761484, "balance_loss_clip": 1.05307436, "balance_loss_mlp": 1.00031805, "epoch": 0.3208080322250947, "flos": 20412236113920.0, "grad_norm": 2.274709374364616, "language_loss": 0.84455216, "learning_rate": 3.1767045779338445e-06, "loss": 0.86382878, "num_input_tokens_seen": 57474895, "step": 2668, "time_per_iteration": 2.5442593097686768 }, { "auxiliary_loss_clip": 0.01176835, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.05360711, "balance_loss_mlp": 1.02522564, "epoch": 0.3209282751157338, "flos": 21762118154880.0, "grad_norm": 2.3339706861538345, "language_loss": 0.91385859, "learning_rate": 3.176074608550839e-06, "loss": 0.93596399, "num_input_tokens_seen": 57490715, "step": 2669, "time_per_iteration": 4.0234880447387695 }, { "auxiliary_loss_clip": 0.01125642, "auxiliary_loss_mlp": 0.01038041, "balance_loss_clip": 1.05138493, "balance_loss_mlp": 1.02958322, "epoch": 0.32104851800637285, "flos": 22055041566720.0, "grad_norm": 2.301230044446547, "language_loss": 0.82308203, "learning_rate": 3.17544446075825e-06, "loss": 0.84471881, "num_input_tokens_seen": 57509880, "step": 2670, "time_per_iteration": 2.6307272911071777 }, { "auxiliary_loss_clip": 0.01172142, "auxiliary_loss_mlp": 0.01032435, "balance_loss_clip": 1.0573349, "balance_loss_mlp": 1.02517533, "epoch": 0.32116876089701196, "flos": 37012301896320.0, "grad_norm": 1.6008794202942818, "language_loss": 0.71016449, "learning_rate": 3.174814134651671e-06, "loss": 0.73221034, "num_input_tokens_seen": 57532430, "step": 2671, "time_per_iteration": 2.6674296855926514 }, { "auxiliary_loss_clip": 0.01189318, "auxiliary_loss_mlp": 0.01034882, "balance_loss_clip": 1.05798197, "balance_loss_mlp": 1.02722287, "epoch": 0.3212890037876511, "flos": 21979233912960.0, "grad_norm": 1.73480445853841, "language_loss": 0.8050698, "learning_rate": 3.1741836303267215e-06, "loss": 0.82731175, "num_input_tokens_seen": 57551965, "step": 2672, "time_per_iteration": 3.3182523250579834 }, { "auxiliary_loss_clip": 0.01189596, "auxiliary_loss_mlp": 0.01031391, "balance_loss_clip": 1.05821872, "balance_loss_mlp": 1.02254534, "epoch": 0.32140924667829013, "flos": 10342345875840.0, "grad_norm": 1.9143589968022052, "language_loss": 0.75182116, "learning_rate": 3.1735529478790496e-06, "loss": 0.77403104, "num_input_tokens_seen": 57569955, "step": 2673, "time_per_iteration": 2.4377593994140625 }, { "auxiliary_loss_clip": 0.0117883, "auxiliary_loss_mlp": 0.01031195, "balance_loss_clip": 1.05787694, "balance_loss_mlp": 1.02237916, "epoch": 0.32152948956892924, "flos": 50798910072960.0, "grad_norm": 1.8540781847067014, "language_loss": 0.79164964, "learning_rate": 3.172922087404328e-06, "loss": 0.81374991, "num_input_tokens_seen": 57592215, "step": 2674, "time_per_iteration": 2.7556631565093994 }, { "auxiliary_loss_clip": 0.01079644, "auxiliary_loss_mlp": 0.01006971, "balance_loss_clip": 1.01718402, "balance_loss_mlp": 1.00530195, "epoch": 0.32164973245956835, "flos": 63863250549120.0, "grad_norm": 0.774204213820679, "language_loss": 0.55248016, "learning_rate": 3.1722910489982586e-06, "loss": 0.57334632, "num_input_tokens_seen": 57652575, "step": 2675, "time_per_iteration": 3.1508233547210693 }, { "auxiliary_loss_clip": 0.01157738, "auxiliary_loss_mlp": 0.010371, "balance_loss_clip": 1.05244756, "balance_loss_mlp": 1.02843308, "epoch": 0.3217699753502074, "flos": 23513948363520.0, "grad_norm": 1.5461703968423186, "language_loss": 0.80198348, "learning_rate": 3.1716598327565694e-06, "loss": 0.82393181, "num_input_tokens_seen": 57672215, "step": 2676, "time_per_iteration": 2.550035238265991 }, { "auxiliary_loss_clip": 0.01192439, "auxiliary_loss_mlp": 0.01027035, "balance_loss_clip": 1.05886412, "balance_loss_mlp": 1.01994836, "epoch": 0.3218902182408465, "flos": 19062533640960.0, "grad_norm": 1.4767632616792772, "language_loss": 0.84132409, "learning_rate": 3.171028438775015e-06, "loss": 0.86351877, "num_input_tokens_seen": 57691410, "step": 2677, "time_per_iteration": 2.463961601257324 }, { "auxiliary_loss_clip": 0.01193673, "auxiliary_loss_mlp": 0.01029074, "balance_loss_clip": 1.05921602, "balance_loss_mlp": 1.02109885, "epoch": 0.3220104611314856, "flos": 20375571306240.0, "grad_norm": 2.16058998437503, "language_loss": 0.84063214, "learning_rate": 3.170396867149377e-06, "loss": 0.86285961, "num_input_tokens_seen": 57709415, "step": 2678, "time_per_iteration": 2.4640145301818848 }, { "auxiliary_loss_clip": 0.01127343, "auxiliary_loss_mlp": 0.01037295, "balance_loss_clip": 1.05160975, "balance_loss_mlp": 1.02830625, "epoch": 0.3221307040221247, "flos": 20117014231680.0, "grad_norm": 1.6902532311491472, "language_loss": 0.86485612, "learning_rate": 3.1697651179754653e-06, "loss": 0.8865025, "num_input_tokens_seen": 57728075, "step": 2679, "time_per_iteration": 2.5881197452545166 }, { "auxiliary_loss_clip": 0.01151689, "auxiliary_loss_mlp": 0.01032881, "balance_loss_clip": 1.05696213, "balance_loss_mlp": 1.02455473, "epoch": 0.3222509469127638, "flos": 23987789602560.0, "grad_norm": 1.770196753166316, "language_loss": 0.73259848, "learning_rate": 3.1691331913491153e-06, "loss": 0.75444412, "num_input_tokens_seen": 57750645, "step": 2680, "time_per_iteration": 2.6227028369903564 }, { "auxiliary_loss_clip": 0.01189915, "auxiliary_loss_mlp": 0.01032205, "balance_loss_clip": 1.05653465, "balance_loss_mlp": 1.02427745, "epoch": 0.32237118980340285, "flos": 17675735397120.0, "grad_norm": 2.0405114950899867, "language_loss": 0.84846628, "learning_rate": 3.1685010873661898e-06, "loss": 0.87068748, "num_input_tokens_seen": 57769820, "step": 2681, "time_per_iteration": 2.4417991638183594 }, { "auxiliary_loss_clip": 0.01174669, "auxiliary_loss_mlp": 0.01032684, "balance_loss_clip": 1.05594146, "balance_loss_mlp": 1.02377272, "epoch": 0.32249143269404196, "flos": 23147982645120.0, "grad_norm": 2.0358809136965608, "language_loss": 0.79536301, "learning_rate": 3.167868806122578e-06, "loss": 0.81743658, "num_input_tokens_seen": 57788870, "step": 2682, "time_per_iteration": 2.509596347808838 }, { "auxiliary_loss_clip": 0.01168401, "auxiliary_loss_mlp": 0.01035691, "balance_loss_clip": 1.05699515, "balance_loss_mlp": 1.027156, "epoch": 0.32261167558468107, "flos": 24422308427520.0, "grad_norm": 2.06066875729796, "language_loss": 0.66108555, "learning_rate": 3.1672363477141968e-06, "loss": 0.68312651, "num_input_tokens_seen": 57808165, "step": 2683, "time_per_iteration": 2.59277606010437 }, { "auxiliary_loss_clip": 0.01167181, "auxiliary_loss_mlp": 0.01030846, "balance_loss_clip": 1.05432749, "balance_loss_mlp": 1.0221796, "epoch": 0.3227319184753201, "flos": 30367175852160.0, "grad_norm": 1.8488892103490198, "language_loss": 0.85220861, "learning_rate": 3.1666037122369903e-06, "loss": 0.87418884, "num_input_tokens_seen": 57828825, "step": 2684, "time_per_iteration": 2.6028285026550293 }, { "auxiliary_loss_clip": 0.01173479, "auxiliary_loss_mlp": 0.01030209, "balance_loss_clip": 1.05186534, "balance_loss_mlp": 1.02182901, "epoch": 0.32285216136595923, "flos": 16946174257920.0, "grad_norm": 1.8799019904958336, "language_loss": 0.86331713, "learning_rate": 3.165970899786928e-06, "loss": 0.88535404, "num_input_tokens_seen": 57846740, "step": 2685, "time_per_iteration": 2.467625379562378 }, { "auxiliary_loss_clip": 0.01148893, "auxiliary_loss_mlp": 0.01028768, "balance_loss_clip": 1.05100632, "balance_loss_mlp": 1.02082872, "epoch": 0.32297240425659834, "flos": 21981532383360.0, "grad_norm": 1.6796931237795973, "language_loss": 0.75592911, "learning_rate": 3.1653379104600067e-06, "loss": 0.77770573, "num_input_tokens_seen": 57866885, "step": 2686, "time_per_iteration": 2.5971837043762207 }, { "auxiliary_loss_clip": 0.0117496, "auxiliary_loss_mlp": 0.0103335, "balance_loss_clip": 1.05391312, "balance_loss_mlp": 1.02533627, "epoch": 0.3230926471472374, "flos": 22748045639040.0, "grad_norm": 1.4919727754951255, "language_loss": 0.69383216, "learning_rate": 3.164704744352251e-06, "loss": 0.7159152, "num_input_tokens_seen": 57887690, "step": 2687, "time_per_iteration": 2.5362489223480225 }, { "auxiliary_loss_clip": 0.01174164, "auxiliary_loss_mlp": 0.01036351, "balance_loss_clip": 1.05433273, "balance_loss_mlp": 1.02859366, "epoch": 0.3232128900378765, "flos": 16942977947520.0, "grad_norm": 1.802104443626859, "language_loss": 0.8097564, "learning_rate": 3.164071401559713e-06, "loss": 0.8318615, "num_input_tokens_seen": 57905090, "step": 2688, "time_per_iteration": 2.4820687770843506 }, { "auxiliary_loss_clip": 0.01164094, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.05433321, "balance_loss_mlp": 1.02716422, "epoch": 0.3233331329285156, "flos": 24023736138240.0, "grad_norm": 1.6977396890435756, "language_loss": 0.71448612, "learning_rate": 3.1634378821784674e-06, "loss": 0.7364831, "num_input_tokens_seen": 57925305, "step": 2689, "time_per_iteration": 3.2998459339141846 }, { "auxiliary_loss_clip": 0.01147915, "auxiliary_loss_mlp": 0.01027756, "balance_loss_clip": 1.05422831, "balance_loss_mlp": 1.01974535, "epoch": 0.3234533758191547, "flos": 18113845582080.0, "grad_norm": 2.2282647007153726, "language_loss": 0.73976028, "learning_rate": 3.1628041863046208e-06, "loss": 0.76151705, "num_input_tokens_seen": 57942720, "step": 2690, "time_per_iteration": 2.523242950439453 }, { "auxiliary_loss_clip": 0.01192856, "auxiliary_loss_mlp": 0.01031491, "balance_loss_clip": 1.05513275, "balance_loss_mlp": 1.02295589, "epoch": 0.3235736187097938, "flos": 16946138344320.0, "grad_norm": 2.0322075622204294, "language_loss": 0.91231847, "learning_rate": 3.162170314034304e-06, "loss": 0.93456185, "num_input_tokens_seen": 57960135, "step": 2691, "time_per_iteration": 2.4767720699310303 }, { "auxiliary_loss_clip": 0.01192774, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.05705285, "balance_loss_mlp": 1.02523649, "epoch": 0.3236938616004329, "flos": 22127150119680.0, "grad_norm": 1.5068799721655897, "language_loss": 0.80879593, "learning_rate": 3.1615362654636738e-06, "loss": 0.83107018, "num_input_tokens_seen": 57980875, "step": 2692, "time_per_iteration": 2.4932491779327393 }, { "auxiliary_loss_clip": 0.01143731, "auxiliary_loss_mlp": 0.01028186, "balance_loss_clip": 1.05539083, "balance_loss_mlp": 1.02082467, "epoch": 0.32381410449107195, "flos": 17164618819200.0, "grad_norm": 1.7703835091331566, "language_loss": 0.86959296, "learning_rate": 3.1609020406889163e-06, "loss": 0.89131212, "num_input_tokens_seen": 57998310, "step": 2693, "time_per_iteration": 2.5520403385162354 }, { "auxiliary_loss_clip": 0.01166215, "auxiliary_loss_mlp": 0.01036976, "balance_loss_clip": 1.05484378, "balance_loss_mlp": 1.02888799, "epoch": 0.32393434738171106, "flos": 16578125550720.0, "grad_norm": 1.6874284054941056, "language_loss": 0.8482362, "learning_rate": 3.1602676398062416e-06, "loss": 0.87026811, "num_input_tokens_seen": 58017220, "step": 2694, "time_per_iteration": 2.5032873153686523 }, { "auxiliary_loss_clip": 0.01176668, "auxiliary_loss_mlp": 0.01032511, "balance_loss_clip": 1.05738974, "balance_loss_mlp": 1.02423167, "epoch": 0.3240545902723502, "flos": 25483612602240.0, "grad_norm": 2.0368208302838315, "language_loss": 0.6113776, "learning_rate": 3.1596330629118886e-06, "loss": 0.6334694, "num_input_tokens_seen": 58037190, "step": 2695, "time_per_iteration": 3.3222272396087646 }, { "auxiliary_loss_clip": 0.01128769, "auxiliary_loss_mlp": 0.01029448, "balance_loss_clip": 1.05058968, "balance_loss_mlp": 1.02148533, "epoch": 0.32417483316298923, "flos": 35845851634560.0, "grad_norm": 2.0148952043331514, "language_loss": 0.7324667, "learning_rate": 3.1589983101021223e-06, "loss": 0.75404894, "num_input_tokens_seen": 58055820, "step": 2696, "time_per_iteration": 3.4463837146759033 }, { "auxiliary_loss_clip": 0.01161817, "auxiliary_loss_mlp": 0.01030865, "balance_loss_clip": 1.05300069, "balance_loss_mlp": 1.0230484, "epoch": 0.32429507605362834, "flos": 30080501406720.0, "grad_norm": 1.902221735412763, "language_loss": 0.84459406, "learning_rate": 3.1583633814732337e-06, "loss": 0.86652088, "num_input_tokens_seen": 58075340, "step": 2697, "time_per_iteration": 2.595353126525879 }, { "auxiliary_loss_clip": 0.01188433, "auxiliary_loss_mlp": 0.01033703, "balance_loss_clip": 1.05471683, "balance_loss_mlp": 1.02483368, "epoch": 0.3244153189442674, "flos": 18223265387520.0, "grad_norm": 3.13325621450345, "language_loss": 0.72047395, "learning_rate": 3.157728277121541e-06, "loss": 0.74269533, "num_input_tokens_seen": 58093515, "step": 2698, "time_per_iteration": 3.2836217880249023 }, { "auxiliary_loss_clip": 0.01189792, "auxiliary_loss_mlp": 0.01028426, "balance_loss_clip": 1.0548569, "balance_loss_mlp": 1.02047193, "epoch": 0.3245355618349065, "flos": 17710317216000.0, "grad_norm": 2.7706940251215526, "language_loss": 0.78712898, "learning_rate": 3.1570929971433897e-06, "loss": 0.80931121, "num_input_tokens_seen": 58109300, "step": 2699, "time_per_iteration": 2.425074338912964 }, { "auxiliary_loss_clip": 0.01178251, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.05926263, "balance_loss_mlp": 1.02528119, "epoch": 0.3246558047255456, "flos": 23440798316160.0, "grad_norm": 2.675091527204574, "language_loss": 0.83813697, "learning_rate": 3.1564575416351504e-06, "loss": 0.86025363, "num_input_tokens_seen": 58128000, "step": 2700, "time_per_iteration": 2.5393710136413574 }, { "auxiliary_loss_clip": 0.01192558, "auxiliary_loss_mlp": 0.0102911, "balance_loss_clip": 1.05829406, "balance_loss_mlp": 1.02050948, "epoch": 0.32477604761618467, "flos": 21760861178880.0, "grad_norm": 2.126879159121157, "language_loss": 0.74361873, "learning_rate": 3.155821910693221e-06, "loss": 0.7658354, "num_input_tokens_seen": 58147415, "step": 2701, "time_per_iteration": 2.4670562744140625 }, { "auxiliary_loss_clip": 0.01160577, "auxiliary_loss_mlp": 0.01031745, "balance_loss_clip": 1.05232048, "balance_loss_mlp": 1.02300727, "epoch": 0.3248962905068238, "flos": 19828328624640.0, "grad_norm": 1.5987712268787602, "language_loss": 0.85843623, "learning_rate": 3.1551861044140275e-06, "loss": 0.88035947, "num_input_tokens_seen": 58167050, "step": 2702, "time_per_iteration": 2.583242416381836 }, { "auxiliary_loss_clip": 0.01128653, "auxiliary_loss_mlp": 0.01030682, "balance_loss_clip": 1.05159807, "balance_loss_mlp": 1.02282584, "epoch": 0.3250165333974629, "flos": 23948215793280.0, "grad_norm": 1.7565730512338849, "language_loss": 0.77677548, "learning_rate": 3.15455012289402e-06, "loss": 0.79836887, "num_input_tokens_seen": 58186695, "step": 2703, "time_per_iteration": 2.6574344635009766 }, { "auxiliary_loss_clip": 0.01180142, "auxiliary_loss_mlp": 0.01027634, "balance_loss_clip": 1.05871403, "balance_loss_mlp": 1.01933062, "epoch": 0.32513677628810195, "flos": 23989333887360.0, "grad_norm": 2.9584641997873877, "language_loss": 0.84100384, "learning_rate": 3.153913966229677e-06, "loss": 0.86308157, "num_input_tokens_seen": 58205815, "step": 2704, "time_per_iteration": 2.572122097015381 }, { "auxiliary_loss_clip": 0.01077912, "auxiliary_loss_mlp": 0.01002846, "balance_loss_clip": 1.02524996, "balance_loss_mlp": 1.00115287, "epoch": 0.32525701917874106, "flos": 70655790009600.0, "grad_norm": 0.6462304245529202, "language_loss": 0.50295472, "learning_rate": 3.1532776345175027e-06, "loss": 0.52376229, "num_input_tokens_seen": 58270960, "step": 2705, "time_per_iteration": 3.101923704147339 }, { "auxiliary_loss_clip": 0.01188721, "auxiliary_loss_mlp": 0.01027797, "balance_loss_clip": 1.05635953, "balance_loss_mlp": 1.02018559, "epoch": 0.32537726206938017, "flos": 19682639061120.0, "grad_norm": 1.8580538008003864, "language_loss": 0.78358245, "learning_rate": 3.1526411278540285e-06, "loss": 0.80574763, "num_input_tokens_seen": 58289390, "step": 2706, "time_per_iteration": 2.4639458656311035 }, { "auxiliary_loss_clip": 0.01167913, "auxiliary_loss_mlp": 0.0103819, "balance_loss_clip": 1.05293274, "balance_loss_mlp": 1.02975011, "epoch": 0.3254975049600192, "flos": 28760999293440.0, "grad_norm": 2.387863359396133, "language_loss": 0.81136453, "learning_rate": 3.1520044463358116e-06, "loss": 0.83342552, "num_input_tokens_seen": 58306120, "step": 2707, "time_per_iteration": 2.6199350357055664 }, { "auxiliary_loss_clip": 0.01171478, "auxiliary_loss_mlp": 0.01034212, "balance_loss_clip": 1.05456603, "balance_loss_mlp": 1.02677011, "epoch": 0.32561774785065833, "flos": 18877378008960.0, "grad_norm": 1.5908859156961546, "language_loss": 0.80231822, "learning_rate": 3.151367590059436e-06, "loss": 0.82437509, "num_input_tokens_seen": 58324545, "step": 2708, "time_per_iteration": 2.5507938861846924 }, { "auxiliary_loss_clip": 0.0119257, "auxiliary_loss_mlp": 0.00761626, "balance_loss_clip": 1.05902362, "balance_loss_mlp": 1.00036347, "epoch": 0.32573799074129745, "flos": 23112107936640.0, "grad_norm": 4.49294439836967, "language_loss": 0.86844987, "learning_rate": 3.1507305591215117e-06, "loss": 0.88799185, "num_input_tokens_seen": 58342455, "step": 2709, "time_per_iteration": 2.4843313694000244 }, { "auxiliary_loss_clip": 0.01074781, "auxiliary_loss_mlp": 0.01002841, "balance_loss_clip": 1.02304006, "balance_loss_mlp": 1.00133348, "epoch": 0.3258582336319365, "flos": 71237650423680.0, "grad_norm": 0.6713513865112859, "language_loss": 0.55770934, "learning_rate": 3.150093353618677e-06, "loss": 0.57848561, "num_input_tokens_seen": 58407185, "step": 2710, "time_per_iteration": 3.1566555500030518 }, { "auxiliary_loss_clip": 0.01178871, "auxiliary_loss_mlp": 0.01032146, "balance_loss_clip": 1.05354643, "balance_loss_mlp": 1.02409947, "epoch": 0.3259784765225756, "flos": 22456020067200.0, "grad_norm": 2.3493225824300845, "language_loss": 0.88633859, "learning_rate": 3.149455973647596e-06, "loss": 0.9084487, "num_input_tokens_seen": 58425245, "step": 2711, "time_per_iteration": 2.506105899810791 }, { "auxiliary_loss_clip": 0.01137927, "auxiliary_loss_mlp": 0.01037493, "balance_loss_clip": 1.04619646, "balance_loss_mlp": 1.02949452, "epoch": 0.32609871941321467, "flos": 20484811543680.0, "grad_norm": 1.7900351868850153, "language_loss": 0.77288747, "learning_rate": 3.1488184193049563e-06, "loss": 0.79464173, "num_input_tokens_seen": 58444780, "step": 2712, "time_per_iteration": 2.574157953262329 }, { "auxiliary_loss_clip": 0.01188959, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.05755162, "balance_loss_mlp": 1.02609205, "epoch": 0.3262189623038538, "flos": 22416805393920.0, "grad_norm": 1.5553554168469481, "language_loss": 0.720245, "learning_rate": 3.1481806906874767e-06, "loss": 0.74247271, "num_input_tokens_seen": 58466090, "step": 2713, "time_per_iteration": 2.5082011222839355 }, { "auxiliary_loss_clip": 0.01190309, "auxiliary_loss_mlp": 0.01029179, "balance_loss_clip": 1.05787659, "balance_loss_mlp": 1.02173471, "epoch": 0.3263392051944929, "flos": 20923496346240.0, "grad_norm": 1.6018333398601972, "language_loss": 0.87659931, "learning_rate": 3.147542787891899e-06, "loss": 0.89879417, "num_input_tokens_seen": 58485435, "step": 2714, "time_per_iteration": 2.4971280097961426 }, { "auxiliary_loss_clip": 0.01162566, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.05491936, "balance_loss_mlp": 1.02239585, "epoch": 0.32645944808513194, "flos": 24025172682240.0, "grad_norm": 1.9100232475315968, "language_loss": 0.75222838, "learning_rate": 3.1469047110149926e-06, "loss": 0.77416062, "num_input_tokens_seen": 58504175, "step": 2715, "time_per_iteration": 3.346435546875 }, { "auxiliary_loss_clip": 0.01131481, "auxiliary_loss_mlp": 0.01032543, "balance_loss_clip": 1.05448437, "balance_loss_mlp": 1.02425742, "epoch": 0.32657969097577105, "flos": 21032413361280.0, "grad_norm": 1.8524256847397533, "language_loss": 0.85097885, "learning_rate": 3.146266460153554e-06, "loss": 0.87261903, "num_input_tokens_seen": 58523885, "step": 2716, "time_per_iteration": 2.604234457015991 }, { "auxiliary_loss_clip": 0.01159569, "auxiliary_loss_mlp": 0.00762018, "balance_loss_clip": 1.05214119, "balance_loss_mlp": 1.00036168, "epoch": 0.32669993386641016, "flos": 22710267509760.0, "grad_norm": 1.5782940324467045, "language_loss": 0.79963642, "learning_rate": 3.145628035404404e-06, "loss": 0.81885231, "num_input_tokens_seen": 58543085, "step": 2717, "time_per_iteration": 2.5571608543395996 }, { "auxiliary_loss_clip": 0.0107023, "auxiliary_loss_mlp": 0.01003885, "balance_loss_clip": 1.01939845, "balance_loss_mlp": 1.00236464, "epoch": 0.3268201767570492, "flos": 72105718406400.0, "grad_norm": 0.9006603205711438, "language_loss": 0.57495886, "learning_rate": 3.1449894368643922e-06, "loss": 0.59570003, "num_input_tokens_seen": 58605400, "step": 2718, "time_per_iteration": 3.164618730545044 }, { "auxiliary_loss_clip": 0.01152454, "auxiliary_loss_mlp": 0.01040249, "balance_loss_clip": 1.05986309, "balance_loss_mlp": 1.03298354, "epoch": 0.32694041964768833, "flos": 24535175938560.0, "grad_norm": 1.4621272017518916, "language_loss": 0.71248221, "learning_rate": 3.1443506646303934e-06, "loss": 0.73440927, "num_input_tokens_seen": 58626700, "step": 2719, "time_per_iteration": 2.6308202743530273 }, { "auxiliary_loss_clip": 0.0117605, "auxiliary_loss_mlp": 0.01035333, "balance_loss_clip": 1.0531739, "balance_loss_mlp": 1.02765, "epoch": 0.32706066253832744, "flos": 33183003755520.0, "grad_norm": 7.353733216894235, "language_loss": 0.66784394, "learning_rate": 3.1437117187993086e-06, "loss": 0.6899578, "num_input_tokens_seen": 58649020, "step": 2720, "time_per_iteration": 2.607372283935547 }, { "auxiliary_loss_clip": 0.01139685, "auxiliary_loss_mlp": 0.01030025, "balance_loss_clip": 1.04875445, "balance_loss_mlp": 1.02206743, "epoch": 0.3271809054289665, "flos": 24061622008320.0, "grad_norm": 1.545054659690862, "language_loss": 0.79804105, "learning_rate": 3.143072599468065e-06, "loss": 0.81973815, "num_input_tokens_seen": 58668845, "step": 2721, "time_per_iteration": 4.12703275680542 }, { "auxiliary_loss_clip": 0.01162847, "auxiliary_loss_mlp": 0.01027879, "balance_loss_clip": 1.05552804, "balance_loss_mlp": 1.01939714, "epoch": 0.3273011483196056, "flos": 38253769712640.0, "grad_norm": 1.5793637624877568, "language_loss": 0.75895953, "learning_rate": 3.1424333067336174e-06, "loss": 0.7808668, "num_input_tokens_seen": 58691610, "step": 2722, "time_per_iteration": 2.686497926712036 }, { "auxiliary_loss_clip": 0.01179696, "auxiliary_loss_mlp": 0.01034878, "balance_loss_clip": 1.05328429, "balance_loss_mlp": 1.02637863, "epoch": 0.3274213912102447, "flos": 29054389582080.0, "grad_norm": 1.7647835417419477, "language_loss": 0.7811048, "learning_rate": 3.141793840692945e-06, "loss": 0.80325061, "num_input_tokens_seen": 58712360, "step": 2723, "time_per_iteration": 2.572366237640381 }, { "auxiliary_loss_clip": 0.01151558, "auxiliary_loss_mlp": 0.01032514, "balance_loss_clip": 1.05204642, "balance_loss_mlp": 1.02399063, "epoch": 0.32754163410088377, "flos": 29133249891840.0, "grad_norm": 1.9916535411542797, "language_loss": 0.61171758, "learning_rate": 3.1411542014430553e-06, "loss": 0.63355827, "num_input_tokens_seen": 58733440, "step": 2724, "time_per_iteration": 3.474208354949951 }, { "auxiliary_loss_clip": 0.0114448, "auxiliary_loss_mlp": 0.01039276, "balance_loss_clip": 1.04724705, "balance_loss_mlp": 1.03109264, "epoch": 0.3276618769915229, "flos": 20631075724800.0, "grad_norm": 1.6531987056918394, "language_loss": 0.8197487, "learning_rate": 3.1405143890809804e-06, "loss": 0.84158623, "num_input_tokens_seen": 58752735, "step": 2725, "time_per_iteration": 2.5931005477905273 }, { "auxiliary_loss_clip": 0.01158684, "auxiliary_loss_mlp": 0.01032809, "balance_loss_clip": 1.05203819, "balance_loss_mlp": 1.02494144, "epoch": 0.327782119882162, "flos": 18657425076480.0, "grad_norm": 1.7158991914535384, "language_loss": 0.69822943, "learning_rate": 3.1398744037037796e-06, "loss": 0.72014439, "num_input_tokens_seen": 58772070, "step": 2726, "time_per_iteration": 2.561450242996216 }, { "auxiliary_loss_clip": 0.01164304, "auxiliary_loss_mlp": 0.0103036, "balance_loss_clip": 1.05624008, "balance_loss_mlp": 1.02220619, "epoch": 0.32790236277280105, "flos": 21795802133760.0, "grad_norm": 3.6370374515952517, "language_loss": 0.8450945, "learning_rate": 3.139234245408538e-06, "loss": 0.86704111, "num_input_tokens_seen": 58790950, "step": 2727, "time_per_iteration": 2.564940929412842 }, { "auxiliary_loss_clip": 0.01150741, "auxiliary_loss_mlp": 0.00762283, "balance_loss_clip": 1.05496395, "balance_loss_mlp": 1.00031722, "epoch": 0.32802260566344016, "flos": 23331414424320.0, "grad_norm": 1.3269052366302796, "language_loss": 0.76134205, "learning_rate": 3.1385939142923666e-06, "loss": 0.78047234, "num_input_tokens_seen": 58813340, "step": 2728, "time_per_iteration": 2.6398210525512695 }, { "auxiliary_loss_clip": 0.0116201, "auxiliary_loss_mlp": 0.01033609, "balance_loss_clip": 1.05210423, "balance_loss_mlp": 1.02469182, "epoch": 0.3281428485540792, "flos": 24206988349440.0, "grad_norm": 1.9369738584059697, "language_loss": 0.78389728, "learning_rate": 3.137953410452405e-06, "loss": 0.80585349, "num_input_tokens_seen": 58833610, "step": 2729, "time_per_iteration": 2.6131255626678467 }, { "auxiliary_loss_clip": 0.01156224, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.04875994, "balance_loss_mlp": 1.02414906, "epoch": 0.3282630914447183, "flos": 34128962380800.0, "grad_norm": 2.531757288333698, "language_loss": 0.74834931, "learning_rate": 3.1373127339858146e-06, "loss": 0.77023762, "num_input_tokens_seen": 58856210, "step": 2730, "time_per_iteration": 2.6980152130126953 }, { "auxiliary_loss_clip": 0.01141139, "auxiliary_loss_mlp": 0.01030016, "balance_loss_clip": 1.04927754, "balance_loss_mlp": 1.02257156, "epoch": 0.32838333433535744, "flos": 27600726170880.0, "grad_norm": 1.7453414053558873, "language_loss": 0.7479049, "learning_rate": 3.136671884989787e-06, "loss": 0.76961637, "num_input_tokens_seen": 58876120, "step": 2731, "time_per_iteration": 2.655062675476074 }, { "auxiliary_loss_clip": 0.01122058, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.04897451, "balance_loss_mlp": 1.02740002, "epoch": 0.3285035772259965, "flos": 12349500935040.0, "grad_norm": 2.649306392081033, "language_loss": 0.86878198, "learning_rate": 3.1360308635615383e-06, "loss": 0.89036274, "num_input_tokens_seen": 58894660, "step": 2732, "time_per_iteration": 2.6797726154327393 }, { "auxiliary_loss_clip": 0.01169531, "auxiliary_loss_mlp": 0.01028369, "balance_loss_clip": 1.05489635, "balance_loss_mlp": 1.01908255, "epoch": 0.3286238201166356, "flos": 24316084932480.0, "grad_norm": 1.8635316579138768, "language_loss": 0.78682673, "learning_rate": 3.135389669798311e-06, "loss": 0.8088057, "num_input_tokens_seen": 58912720, "step": 2733, "time_per_iteration": 2.6584837436676025 }, { "auxiliary_loss_clip": 0.01174309, "auxiliary_loss_mlp": 0.00761707, "balance_loss_clip": 1.05413806, "balance_loss_mlp": 1.000278, "epoch": 0.3287440630072747, "flos": 21392812471680.0, "grad_norm": 2.1597863399826673, "language_loss": 0.80061519, "learning_rate": 3.134748303797373e-06, "loss": 0.81997532, "num_input_tokens_seen": 58930090, "step": 2734, "time_per_iteration": 2.5492138862609863 }, { "auxiliary_loss_clip": 0.01131924, "auxiliary_loss_mlp": 0.01033658, "balance_loss_clip": 1.04943109, "balance_loss_mlp": 1.02431774, "epoch": 0.32886430589791377, "flos": 23732536579200.0, "grad_norm": 1.7768059144153656, "language_loss": 0.8112843, "learning_rate": 3.1341067656560203e-06, "loss": 0.8329401, "num_input_tokens_seen": 58947935, "step": 2735, "time_per_iteration": 2.666851282119751 }, { "auxiliary_loss_clip": 0.01168043, "auxiliary_loss_mlp": 0.01035629, "balance_loss_clip": 1.05197322, "balance_loss_mlp": 1.02760649, "epoch": 0.3289845487885529, "flos": 22418708814720.0, "grad_norm": 2.2640664974989195, "language_loss": 0.86291277, "learning_rate": 3.133465055471572e-06, "loss": 0.88494945, "num_input_tokens_seen": 58967720, "step": 2736, "time_per_iteration": 2.5866215229034424 }, { "auxiliary_loss_clip": 0.01142905, "auxiliary_loss_mlp": 0.01032043, "balance_loss_clip": 1.05273509, "balance_loss_mlp": 1.02386487, "epoch": 0.329104791679192, "flos": 19682603147520.0, "grad_norm": 2.210843071015765, "language_loss": 0.66612774, "learning_rate": 3.1328231733413767e-06, "loss": 0.68787718, "num_input_tokens_seen": 58984360, "step": 2737, "time_per_iteration": 2.6438491344451904 }, { "auxiliary_loss_clip": 0.01172249, "auxiliary_loss_mlp": 0.01027752, "balance_loss_clip": 1.05476451, "balance_loss_mlp": 1.0195682, "epoch": 0.32922503456983104, "flos": 15997234803840.0, "grad_norm": 2.206875015761754, "language_loss": 0.90782261, "learning_rate": 3.1321811193628067e-06, "loss": 0.92982256, "num_input_tokens_seen": 59002505, "step": 2738, "time_per_iteration": 2.533637046813965 }, { "auxiliary_loss_clip": 0.01175697, "auxiliary_loss_mlp": 0.00762798, "balance_loss_clip": 1.05777073, "balance_loss_mlp": 1.00032806, "epoch": 0.32934527746047015, "flos": 26834069260800.0, "grad_norm": 1.9606160401906019, "language_loss": 0.70246625, "learning_rate": 3.131538893633261e-06, "loss": 0.72185123, "num_input_tokens_seen": 59022065, "step": 2739, "time_per_iteration": 2.6229028701782227 }, { "auxiliary_loss_clip": 0.01192844, "auxiliary_loss_mlp": 0.01033595, "balance_loss_clip": 1.05827308, "balance_loss_mlp": 1.02590632, "epoch": 0.32946552035110926, "flos": 23403774372480.0, "grad_norm": 2.1682453427527872, "language_loss": 0.78371221, "learning_rate": 3.130896496250165e-06, "loss": 0.80597663, "num_input_tokens_seen": 59041890, "step": 2740, "time_per_iteration": 2.5483930110931396 }, { "auxiliary_loss_clip": 0.01189709, "auxiliary_loss_mlp": 0.01030926, "balance_loss_clip": 1.05564237, "balance_loss_mlp": 1.02246869, "epoch": 0.3295857632417483, "flos": 14172470029440.0, "grad_norm": 2.0467908361598077, "language_loss": 0.86837316, "learning_rate": 3.1302539273109693e-06, "loss": 0.89057952, "num_input_tokens_seen": 59058715, "step": 2741, "time_per_iteration": 3.2041032314300537 }, { "auxiliary_loss_clip": 0.01157412, "auxiliary_loss_mlp": 0.0103531, "balance_loss_clip": 1.05510879, "balance_loss_mlp": 1.02701867, "epoch": 0.32970600613238743, "flos": 22196708807040.0, "grad_norm": 1.6788229921084543, "language_loss": 0.80675042, "learning_rate": 3.1296111869131513e-06, "loss": 0.82867765, "num_input_tokens_seen": 59076140, "step": 2742, "time_per_iteration": 2.6136317253112793 }, { "auxiliary_loss_clip": 0.0118984, "auxiliary_loss_mlp": 0.01028374, "balance_loss_clip": 1.05642319, "balance_loss_mlp": 1.02050614, "epoch": 0.32982624902302654, "flos": 22053784590720.0, "grad_norm": 1.7123549393077075, "language_loss": 0.85732198, "learning_rate": 3.1289682751542153e-06, "loss": 0.87950408, "num_input_tokens_seen": 59095700, "step": 2743, "time_per_iteration": 2.5157485008239746 }, { "auxiliary_loss_clip": 0.01174111, "auxiliary_loss_mlp": 0.01039938, "balance_loss_clip": 1.0534749, "balance_loss_mlp": 1.03169477, "epoch": 0.3299464919136656, "flos": 18661626967680.0, "grad_norm": 2.003460012113146, "language_loss": 0.71528089, "learning_rate": 3.1283251921316883e-06, "loss": 0.73742139, "num_input_tokens_seen": 59113445, "step": 2744, "time_per_iteration": 2.5308997631073 }, { "auxiliary_loss_clip": 0.0113307, "auxiliary_loss_mlp": 0.01029896, "balance_loss_clip": 1.05424511, "balance_loss_mlp": 1.02179599, "epoch": 0.3300667348043047, "flos": 13407357404160.0, "grad_norm": 1.8534293225606333, "language_loss": 0.8071481, "learning_rate": 3.1276819379431277e-06, "loss": 0.82877779, "num_input_tokens_seen": 59131535, "step": 2745, "time_per_iteration": 2.6399288177490234 }, { "auxiliary_loss_clip": 0.01169652, "auxiliary_loss_mlp": 0.00761677, "balance_loss_clip": 1.05673838, "balance_loss_mlp": 1.00027359, "epoch": 0.33018697769494376, "flos": 15742556398080.0, "grad_norm": 2.474496866471333, "language_loss": 0.7509402, "learning_rate": 3.1270385126861134e-06, "loss": 0.77025348, "num_input_tokens_seen": 59149520, "step": 2746, "time_per_iteration": 2.574673652648926 }, { "auxiliary_loss_clip": 0.01191952, "auxiliary_loss_mlp": 0.01035888, "balance_loss_clip": 1.05723381, "balance_loss_mlp": 1.02790701, "epoch": 0.3303072205855829, "flos": 18258601392000.0, "grad_norm": 1.7894008463564421, "language_loss": 0.81990361, "learning_rate": 3.1263949164582533e-06, "loss": 0.84218198, "num_input_tokens_seen": 59169170, "step": 2747, "time_per_iteration": 3.983107328414917 }, { "auxiliary_loss_clip": 0.01190331, "auxiliary_loss_mlp": 0.01029727, "balance_loss_clip": 1.05483842, "balance_loss_mlp": 1.0211972, "epoch": 0.330427463476222, "flos": 17749424148480.0, "grad_norm": 1.9480111981355575, "language_loss": 0.77727699, "learning_rate": 3.1257511493571797e-06, "loss": 0.79947758, "num_input_tokens_seen": 59187675, "step": 2748, "time_per_iteration": 2.5156633853912354 }, { "auxiliary_loss_clip": 0.01150365, "auxiliary_loss_mlp": 0.01033398, "balance_loss_clip": 1.05326605, "balance_loss_mlp": 1.02541685, "epoch": 0.33054770636686104, "flos": 27162580072320.0, "grad_norm": 1.6932416676096262, "language_loss": 0.7828669, "learning_rate": 3.125107211480552e-06, "loss": 0.80470455, "num_input_tokens_seen": 59207610, "step": 2749, "time_per_iteration": 3.5037143230438232 }, { "auxiliary_loss_clip": 0.01117597, "auxiliary_loss_mlp": 0.01032454, "balance_loss_clip": 1.04909766, "balance_loss_mlp": 1.02458572, "epoch": 0.33066794925750015, "flos": 20117193799680.0, "grad_norm": 1.676027627611513, "language_loss": 0.79965723, "learning_rate": 3.124463102926054e-06, "loss": 0.82115769, "num_input_tokens_seen": 59226945, "step": 2750, "time_per_iteration": 2.6969716548919678 }, { "auxiliary_loss_clip": 0.01072794, "auxiliary_loss_mlp": 0.0100307, "balance_loss_clip": 1.02583826, "balance_loss_mlp": 1.00165689, "epoch": 0.33078819214813926, "flos": 70642609718400.0, "grad_norm": 0.7627786915019239, "language_loss": 0.61694014, "learning_rate": 3.1238188237913984e-06, "loss": 0.63769877, "num_input_tokens_seen": 59291485, "step": 2751, "time_per_iteration": 3.222346305847168 }, { "auxiliary_loss_clip": 0.01200296, "auxiliary_loss_mlp": 0.01035768, "balance_loss_clip": 1.06190431, "balance_loss_mlp": 1.02710748, "epoch": 0.3309084350387783, "flos": 21141940907520.0, "grad_norm": 1.9444929296606603, "language_loss": 0.76296949, "learning_rate": 3.1231743741743202e-06, "loss": 0.78533006, "num_input_tokens_seen": 59310990, "step": 2752, "time_per_iteration": 2.534473180770874 }, { "auxiliary_loss_clip": 0.01171874, "auxiliary_loss_mlp": 0.01032142, "balance_loss_clip": 1.05313396, "balance_loss_mlp": 1.02370238, "epoch": 0.3310286779294174, "flos": 14209350318720.0, "grad_norm": 2.1359025044125475, "language_loss": 0.83574426, "learning_rate": 3.122529754172582e-06, "loss": 0.85778439, "num_input_tokens_seen": 59327875, "step": 2753, "time_per_iteration": 2.526928186416626 }, { "auxiliary_loss_clip": 0.0117828, "auxiliary_loss_mlp": 0.01031137, "balance_loss_clip": 1.05664873, "balance_loss_mlp": 1.02276897, "epoch": 0.33114892082005654, "flos": 20778130005120.0, "grad_norm": 1.7762631572684908, "language_loss": 0.72551751, "learning_rate": 3.1218849638839736e-06, "loss": 0.74761164, "num_input_tokens_seen": 59347135, "step": 2754, "time_per_iteration": 2.5743556022644043 }, { "auxiliary_loss_clip": 0.01135986, "auxiliary_loss_mlp": 0.01031967, "balance_loss_clip": 1.04770517, "balance_loss_mlp": 1.02297211, "epoch": 0.3312691637106956, "flos": 17090750499840.0, "grad_norm": 1.808933091809179, "language_loss": 0.78529561, "learning_rate": 3.121240003406307e-06, "loss": 0.80697513, "num_input_tokens_seen": 59365985, "step": 2755, "time_per_iteration": 2.608844518661499 }, { "auxiliary_loss_clip": 0.01153643, "auxiliary_loss_mlp": 0.01030758, "balance_loss_clip": 1.055722, "balance_loss_mlp": 1.02233613, "epoch": 0.3313894066013347, "flos": 29456230008960.0, "grad_norm": 1.8143974951055173, "language_loss": 0.72303003, "learning_rate": 3.120594872837425e-06, "loss": 0.74487412, "num_input_tokens_seen": 59384655, "step": 2756, "time_per_iteration": 2.6767666339874268 }, { "auxiliary_loss_clip": 0.01067696, "auxiliary_loss_mlp": 0.00752422, "balance_loss_clip": 1.01838791, "balance_loss_mlp": 1.000458, "epoch": 0.3315096494919738, "flos": 61419242280960.0, "grad_norm": 0.8308167306280487, "language_loss": 0.62396312, "learning_rate": 3.1199495722751906e-06, "loss": 0.64216429, "num_input_tokens_seen": 59444185, "step": 2757, "time_per_iteration": 3.151993989944458 }, { "auxiliary_loss_clip": 0.01137065, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.04964137, "balance_loss_mlp": 1.02507937, "epoch": 0.33162989238261287, "flos": 21653057485440.0, "grad_norm": 2.7570209205688605, "language_loss": 0.83463514, "learning_rate": 3.1193041018174972e-06, "loss": 0.85633844, "num_input_tokens_seen": 59464900, "step": 2758, "time_per_iteration": 2.654470443725586 }, { "auxiliary_loss_clip": 0.01181138, "auxiliary_loss_mlp": 0.01038067, "balance_loss_clip": 1.05776823, "balance_loss_mlp": 1.029037, "epoch": 0.331750135273252, "flos": 22674787850880.0, "grad_norm": 1.9562257207840053, "language_loss": 0.94763362, "learning_rate": 3.118658461562261e-06, "loss": 0.96982563, "num_input_tokens_seen": 59481000, "step": 2759, "time_per_iteration": 2.557013750076294 }, { "auxiliary_loss_clip": 0.01163437, "auxiliary_loss_mlp": 0.01033806, "balance_loss_clip": 1.05728507, "balance_loss_mlp": 1.02503228, "epoch": 0.33187037816389103, "flos": 22746896403840.0, "grad_norm": 1.3694378427038796, "language_loss": 0.84908628, "learning_rate": 3.118012651607426e-06, "loss": 0.8710587, "num_input_tokens_seen": 59502605, "step": 2760, "time_per_iteration": 2.610518217086792 }, { "auxiliary_loss_clip": 0.01189626, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.05629873, "balance_loss_mlp": 1.02120996, "epoch": 0.33199062105453014, "flos": 19203769918080.0, "grad_norm": 2.3366222055349217, "language_loss": 0.83406103, "learning_rate": 3.1173666720509603e-06, "loss": 0.85625398, "num_input_tokens_seen": 59519540, "step": 2761, "time_per_iteration": 2.464991569519043 }, { "auxiliary_loss_clip": 0.01167727, "auxiliary_loss_mlp": 0.01030567, "balance_loss_clip": 1.05404973, "balance_loss_mlp": 1.02206743, "epoch": 0.33211086394516925, "flos": 31577006764800.0, "grad_norm": 1.704019387385593, "language_loss": 0.68158388, "learning_rate": 3.116720522990859e-06, "loss": 0.70356685, "num_input_tokens_seen": 59540415, "step": 2762, "time_per_iteration": 2.639462471008301 }, { "auxiliary_loss_clip": 0.01120063, "auxiliary_loss_mlp": 0.01034166, "balance_loss_clip": 1.05033457, "balance_loss_mlp": 1.02556539, "epoch": 0.3322311068358083, "flos": 17932496791680.0, "grad_norm": 1.732106871289698, "language_loss": 0.62254059, "learning_rate": 3.116074204525142e-06, "loss": 0.6440829, "num_input_tokens_seen": 59558590, "step": 2763, "time_per_iteration": 2.645892858505249 }, { "auxiliary_loss_clip": 0.01167822, "auxiliary_loss_mlp": 0.01032664, "balance_loss_clip": 1.05402088, "balance_loss_mlp": 1.02388406, "epoch": 0.3323513497264474, "flos": 32269831269120.0, "grad_norm": 1.4403111684202343, "language_loss": 0.83469957, "learning_rate": 3.1154277167518553e-06, "loss": 0.85670441, "num_input_tokens_seen": 59580205, "step": 2764, "time_per_iteration": 2.632929563522339 }, { "auxiliary_loss_clip": 0.01051608, "auxiliary_loss_mlp": 0.01008725, "balance_loss_clip": 1.01588202, "balance_loss_mlp": 1.00716352, "epoch": 0.33247159261708653, "flos": 52668674588160.0, "grad_norm": 0.7806237321066729, "language_loss": 0.59484404, "learning_rate": 3.114781059769072e-06, "loss": 0.61544734, "num_input_tokens_seen": 59631530, "step": 2765, "time_per_iteration": 2.9860823154449463 }, { "auxiliary_loss_clip": 0.01162701, "auxiliary_loss_mlp": 0.01030087, "balance_loss_clip": 1.05218077, "balance_loss_mlp": 1.0213666, "epoch": 0.3325918355077256, "flos": 27125232906240.0, "grad_norm": 3.886624193414405, "language_loss": 0.67224443, "learning_rate": 3.1141342336748874e-06, "loss": 0.69417232, "num_input_tokens_seen": 59651090, "step": 2766, "time_per_iteration": 2.6145172119140625 }, { "auxiliary_loss_clip": 0.01177613, "auxiliary_loss_mlp": 0.01033033, "balance_loss_clip": 1.05881524, "balance_loss_mlp": 1.02517748, "epoch": 0.3327120783983647, "flos": 23664414435840.0, "grad_norm": 1.4338403622526639, "language_loss": 0.82168186, "learning_rate": 3.1134872385674253e-06, "loss": 0.84378827, "num_input_tokens_seen": 59675245, "step": 2767, "time_per_iteration": 3.3863301277160645 }, { "auxiliary_loss_clip": 0.01163542, "auxiliary_loss_mlp": 0.01028764, "balance_loss_clip": 1.0504458, "balance_loss_mlp": 1.02086878, "epoch": 0.3328323212890038, "flos": 19171378828800.0, "grad_norm": 1.801009305326311, "language_loss": 0.85479379, "learning_rate": 3.1128400745448353e-06, "loss": 0.87671679, "num_input_tokens_seen": 59694625, "step": 2768, "time_per_iteration": 2.5556745529174805 }, { "auxiliary_loss_clip": 0.0117963, "auxiliary_loss_mlp": 0.01033306, "balance_loss_clip": 1.05656183, "balance_loss_mlp": 1.02527726, "epoch": 0.33295256417964286, "flos": 37706347463040.0, "grad_norm": 2.618110291672558, "language_loss": 0.629264, "learning_rate": 3.11219274170529e-06, "loss": 0.65139341, "num_input_tokens_seen": 59716435, "step": 2769, "time_per_iteration": 2.648175001144409 }, { "auxiliary_loss_clip": 0.0115509, "auxiliary_loss_mlp": 0.01033546, "balance_loss_clip": 1.05046296, "balance_loss_mlp": 1.02582097, "epoch": 0.333072807070282, "flos": 26505989412480.0, "grad_norm": 1.875694110402851, "language_loss": 0.81791508, "learning_rate": 3.1115452401469903e-06, "loss": 0.83980143, "num_input_tokens_seen": 59736835, "step": 2770, "time_per_iteration": 2.6086814403533936 }, { "auxiliary_loss_clip": 0.01122061, "auxiliary_loss_mlp": 0.01031263, "balance_loss_clip": 1.04608107, "balance_loss_mlp": 1.02384245, "epoch": 0.3331930499609211, "flos": 21430913823360.0, "grad_norm": 2.0117282403765464, "language_loss": 0.86488992, "learning_rate": 3.1108975699681613e-06, "loss": 0.88642317, "num_input_tokens_seen": 59754230, "step": 2771, "time_per_iteration": 2.6085333824157715 }, { "auxiliary_loss_clip": 0.01147027, "auxiliary_loss_mlp": 0.01035928, "balance_loss_clip": 1.05264485, "balance_loss_mlp": 1.02839684, "epoch": 0.33331329285156014, "flos": 20659947281280.0, "grad_norm": 2.010846690736538, "language_loss": 0.71842688, "learning_rate": 3.1102497312670542e-06, "loss": 0.74025643, "num_input_tokens_seen": 59772235, "step": 2772, "time_per_iteration": 3.3850274085998535 }, { "auxiliary_loss_clip": 0.01151412, "auxiliary_loss_mlp": 0.0102869, "balance_loss_clip": 1.05319405, "balance_loss_mlp": 1.02075684, "epoch": 0.33343353574219925, "flos": 28001596930560.0, "grad_norm": 2.016638957595795, "language_loss": 0.80477053, "learning_rate": 3.109601724141946e-06, "loss": 0.82657152, "num_input_tokens_seen": 59791230, "step": 2773, "time_per_iteration": 3.3427720069885254 }, { "auxiliary_loss_clip": 0.01156323, "auxiliary_loss_mlp": 0.01028335, "balance_loss_clip": 1.05165076, "balance_loss_mlp": 1.02002597, "epoch": 0.33355377863283836, "flos": 23764963582080.0, "grad_norm": 1.6426050152662954, "language_loss": 0.67951334, "learning_rate": 3.108953548691138e-06, "loss": 0.70135987, "num_input_tokens_seen": 59811315, "step": 2774, "time_per_iteration": 2.5613765716552734 }, { "auxiliary_loss_clip": 0.0119178, "auxiliary_loss_mlp": 0.0102882, "balance_loss_clip": 1.05712712, "balance_loss_mlp": 1.02036762, "epoch": 0.3336740215234774, "flos": 37779677078400.0, "grad_norm": 2.1292685985883097, "language_loss": 0.72697675, "learning_rate": 3.108305205012959e-06, "loss": 0.7491827, "num_input_tokens_seen": 59832010, "step": 2775, "time_per_iteration": 3.4604666233062744 }, { "auxiliary_loss_clip": 0.0116306, "auxiliary_loss_mlp": 0.01028951, "balance_loss_clip": 1.05439198, "balance_loss_mlp": 1.02104211, "epoch": 0.3337942644141165, "flos": 25519056347520.0, "grad_norm": 2.293062378267243, "language_loss": 0.87311637, "learning_rate": 3.107656693205761e-06, "loss": 0.89503646, "num_input_tokens_seen": 59851450, "step": 2776, "time_per_iteration": 2.616868019104004 }, { "auxiliary_loss_clip": 0.01196814, "auxiliary_loss_mlp": 0.01033485, "balance_loss_clip": 1.05877006, "balance_loss_mlp": 1.02416921, "epoch": 0.3339145073047556, "flos": 25989844930560.0, "grad_norm": 4.014427190952157, "language_loss": 0.70470929, "learning_rate": 3.107008013367924e-06, "loss": 0.72701228, "num_input_tokens_seen": 59870245, "step": 2777, "time_per_iteration": 2.5142111778259277 }, { "auxiliary_loss_clip": 0.01146123, "auxiliary_loss_mlp": 0.01028223, "balance_loss_clip": 1.0510112, "balance_loss_mlp": 1.02070725, "epoch": 0.3340347501953947, "flos": 19062569554560.0, "grad_norm": 1.8741735741192642, "language_loss": 0.86698985, "learning_rate": 3.1063591655978507e-06, "loss": 0.88873327, "num_input_tokens_seen": 59886195, "step": 2778, "time_per_iteration": 2.581897497177124 }, { "auxiliary_loss_clip": 0.01120684, "auxiliary_loss_mlp": 0.01027746, "balance_loss_clip": 1.04538083, "balance_loss_mlp": 1.01946676, "epoch": 0.3341549930860338, "flos": 18109715518080.0, "grad_norm": 1.8599079760513766, "language_loss": 0.79472232, "learning_rate": 3.105710149993972e-06, "loss": 0.81620663, "num_input_tokens_seen": 59905525, "step": 2779, "time_per_iteration": 2.6028857231140137 }, { "auxiliary_loss_clip": 0.01191275, "auxiliary_loss_mlp": 0.01036671, "balance_loss_clip": 1.05697846, "balance_loss_mlp": 1.0284512, "epoch": 0.33427523597667286, "flos": 22674967418880.0, "grad_norm": 2.075112232561446, "language_loss": 0.8538534, "learning_rate": 3.1050609666547427e-06, "loss": 0.87613285, "num_input_tokens_seen": 59925085, "step": 2780, "time_per_iteration": 2.5111191272735596 }, { "auxiliary_loss_clip": 0.01154694, "auxiliary_loss_mlp": 0.01038985, "balance_loss_clip": 1.0551194, "balance_loss_mlp": 1.03074145, "epoch": 0.33439547886731197, "flos": 22638338524800.0, "grad_norm": 1.9617442772195952, "language_loss": 0.77440172, "learning_rate": 3.104411615678644e-06, "loss": 0.7963385, "num_input_tokens_seen": 59943935, "step": 2781, "time_per_iteration": 2.5984508991241455 }, { "auxiliary_loss_clip": 0.0115671, "auxiliary_loss_mlp": 0.01041678, "balance_loss_clip": 1.05235517, "balance_loss_mlp": 1.03285027, "epoch": 0.3345157217579511, "flos": 24096383395200.0, "grad_norm": 7.865890284207227, "language_loss": 0.73277342, "learning_rate": 3.1037620971641803e-06, "loss": 0.75475729, "num_input_tokens_seen": 59963725, "step": 2782, "time_per_iteration": 2.585015296936035 }, { "auxiliary_loss_clip": 0.0119025, "auxiliary_loss_mlp": 0.01037698, "balance_loss_clip": 1.05544424, "balance_loss_mlp": 1.0283227, "epoch": 0.33463596464859013, "flos": 18989491334400.0, "grad_norm": 2.984575702655751, "language_loss": 0.6515348, "learning_rate": 3.1031124112098844e-06, "loss": 0.6738143, "num_input_tokens_seen": 59981935, "step": 2783, "time_per_iteration": 2.4900197982788086 }, { "auxiliary_loss_clip": 0.0116661, "auxiliary_loss_mlp": 0.01033239, "balance_loss_clip": 1.05442941, "balance_loss_mlp": 1.02484632, "epoch": 0.33475620753922924, "flos": 20375607219840.0, "grad_norm": 1.8656066285759203, "language_loss": 0.72581875, "learning_rate": 3.1024625579143127e-06, "loss": 0.74781722, "num_input_tokens_seen": 59999455, "step": 2784, "time_per_iteration": 2.560081720352173 }, { "auxiliary_loss_clip": 0.01191043, "auxiliary_loss_mlp": 0.01030768, "balance_loss_clip": 1.05780494, "balance_loss_mlp": 1.02262604, "epoch": 0.33487645042986836, "flos": 18182578256640.0, "grad_norm": 1.9874618109270874, "language_loss": 0.732072, "learning_rate": 3.101812537376048e-06, "loss": 0.7542901, "num_input_tokens_seen": 60018475, "step": 2785, "time_per_iteration": 2.4848573207855225 }, { "auxiliary_loss_clip": 0.01152986, "auxiliary_loss_mlp": 0.00761565, "balance_loss_clip": 1.0525943, "balance_loss_mlp": 1.00025129, "epoch": 0.3349966933205074, "flos": 25848824135040.0, "grad_norm": 2.022043152550842, "language_loss": 0.83934689, "learning_rate": 3.1011623496936973e-06, "loss": 0.85849237, "num_input_tokens_seen": 60036770, "step": 2786, "time_per_iteration": 2.5965471267700195 }, { "auxiliary_loss_clip": 0.01188979, "auxiliary_loss_mlp": 0.01033154, "balance_loss_clip": 1.05653226, "balance_loss_mlp": 1.02538157, "epoch": 0.3351169362111465, "flos": 28111447699200.0, "grad_norm": 2.697073426846691, "language_loss": 0.69428551, "learning_rate": 3.100511994965893e-06, "loss": 0.71650684, "num_input_tokens_seen": 60056725, "step": 2787, "time_per_iteration": 2.531032085418701 }, { "auxiliary_loss_clip": 0.01166323, "auxiliary_loss_mlp": 0.01039735, "balance_loss_clip": 1.05203164, "balance_loss_mlp": 1.03142571, "epoch": 0.33523717910178563, "flos": 22673315393280.0, "grad_norm": 1.6062580179427184, "language_loss": 0.8428483, "learning_rate": 3.0998614732912947e-06, "loss": 0.86490887, "num_input_tokens_seen": 60076100, "step": 2788, "time_per_iteration": 2.546074151992798 }, { "auxiliary_loss_clip": 0.01174853, "auxiliary_loss_mlp": 0.01032863, "balance_loss_clip": 1.05619121, "balance_loss_mlp": 1.02389288, "epoch": 0.3353574219924247, "flos": 15669801400320.0, "grad_norm": 2.227338128291741, "language_loss": 0.68426156, "learning_rate": 3.0992107847685855e-06, "loss": 0.70633876, "num_input_tokens_seen": 60093815, "step": 2789, "time_per_iteration": 2.472705125808716 }, { "auxiliary_loss_clip": 0.01164388, "auxiliary_loss_mlp": 0.01038141, "balance_loss_clip": 1.0572772, "balance_loss_mlp": 1.02976644, "epoch": 0.3354776648830638, "flos": 24790644443520.0, "grad_norm": 1.7158237031351875, "language_loss": 0.79667324, "learning_rate": 3.0985599294964736e-06, "loss": 0.81869847, "num_input_tokens_seen": 60113370, "step": 2790, "time_per_iteration": 2.579221248626709 }, { "auxiliary_loss_clip": 0.01157669, "auxiliary_loss_mlp": 0.01037215, "balance_loss_clip": 1.05213571, "balance_loss_mlp": 1.02877474, "epoch": 0.33559790777370285, "flos": 28694852398080.0, "grad_norm": 1.8772414419336398, "language_loss": 0.6990332, "learning_rate": 3.097908907573695e-06, "loss": 0.72098207, "num_input_tokens_seen": 60131350, "step": 2791, "time_per_iteration": 2.62852144241333 }, { "auxiliary_loss_clip": 0.01121277, "auxiliary_loss_mlp": 0.01035459, "balance_loss_clip": 1.05153251, "balance_loss_mlp": 1.02746582, "epoch": 0.33571815066434196, "flos": 22235779825920.0, "grad_norm": 1.9235489123786909, "language_loss": 0.89623404, "learning_rate": 3.0972577190990067e-06, "loss": 0.91780138, "num_input_tokens_seen": 60149830, "step": 2792, "time_per_iteration": 2.6461944580078125 }, { "auxiliary_loss_clip": 0.01151419, "auxiliary_loss_mlp": 0.01030589, "balance_loss_clip": 1.05165362, "balance_loss_mlp": 1.02253628, "epoch": 0.3358383935549811, "flos": 23842279607040.0, "grad_norm": 1.8340833430337022, "language_loss": 0.79663914, "learning_rate": 3.096606364171196e-06, "loss": 0.81845927, "num_input_tokens_seen": 60169620, "step": 2793, "time_per_iteration": 3.3135764598846436 }, { "auxiliary_loss_clip": 0.01132791, "auxiliary_loss_mlp": 0.0103006, "balance_loss_clip": 1.04867601, "balance_loss_mlp": 1.02207875, "epoch": 0.33595863644562013, "flos": 22267308988800.0, "grad_norm": 1.7378180038215276, "language_loss": 0.84903878, "learning_rate": 3.0959548428890703e-06, "loss": 0.87066722, "num_input_tokens_seen": 60188490, "step": 2794, "time_per_iteration": 2.5777885913848877 }, { "auxiliary_loss_clip": 0.01175729, "auxiliary_loss_mlp": 0.01032814, "balance_loss_clip": 1.05611241, "balance_loss_mlp": 1.02458274, "epoch": 0.33607887933625924, "flos": 20119779578880.0, "grad_norm": 1.8117735820156302, "language_loss": 0.83922189, "learning_rate": 3.095303155351468e-06, "loss": 0.86130732, "num_input_tokens_seen": 60208695, "step": 2795, "time_per_iteration": 2.5712294578552246 }, { "auxiliary_loss_clip": 0.01125677, "auxiliary_loss_mlp": 0.0103854, "balance_loss_clip": 1.05122256, "balance_loss_mlp": 1.02967715, "epoch": 0.33619912222689835, "flos": 19318109886720.0, "grad_norm": 2.1665663620046787, "language_loss": 0.79349971, "learning_rate": 3.0946513016572464e-06, "loss": 0.81514192, "num_input_tokens_seen": 60227600, "step": 2796, "time_per_iteration": 2.638458728790283 }, { "auxiliary_loss_clip": 0.01176085, "auxiliary_loss_mlp": 0.01033815, "balance_loss_clip": 1.0537467, "balance_loss_mlp": 1.02448738, "epoch": 0.3363193651175374, "flos": 16800664262400.0, "grad_norm": 1.8667049658915744, "language_loss": 0.77005142, "learning_rate": 3.0939992819052938e-06, "loss": 0.79215038, "num_input_tokens_seen": 60245110, "step": 2797, "time_per_iteration": 2.48612380027771 }, { "auxiliary_loss_clip": 0.0116314, "auxiliary_loss_mlp": 0.01032059, "balance_loss_clip": 1.05588043, "balance_loss_mlp": 1.02307034, "epoch": 0.3364396080081765, "flos": 23550289948800.0, "grad_norm": 1.8498968668099007, "language_loss": 0.80902696, "learning_rate": 3.0933470961945193e-06, "loss": 0.83097899, "num_input_tokens_seen": 60263405, "step": 2798, "time_per_iteration": 3.356311798095703 }, { "auxiliary_loss_clip": 0.01158257, "auxiliary_loss_mlp": 0.01038165, "balance_loss_clip": 1.05419207, "balance_loss_mlp": 1.02962947, "epoch": 0.3365598508988156, "flos": 28037902602240.0, "grad_norm": 1.6774446358835056, "language_loss": 0.68206668, "learning_rate": 3.0926947446238597e-06, "loss": 0.70403087, "num_input_tokens_seen": 60282975, "step": 2799, "time_per_iteration": 3.313974380493164 }, { "auxiliary_loss_clip": 0.01178751, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 1.05204892, "balance_loss_mlp": 1.0184114, "epoch": 0.3366800937894547, "flos": 16982767238400.0, "grad_norm": 2.3878338157915295, "language_loss": 0.82329828, "learning_rate": 3.092042227292276e-06, "loss": 0.84535629, "num_input_tokens_seen": 60299810, "step": 2800, "time_per_iteration": 2.5129904747009277 }, { "auxiliary_loss_clip": 0.01189863, "auxiliary_loss_mlp": 0.01036857, "balance_loss_clip": 1.05771589, "balance_loss_mlp": 1.02843499, "epoch": 0.3368003366800938, "flos": 23915321913600.0, "grad_norm": 1.5429127966283056, "language_loss": 0.87740183, "learning_rate": 3.0913895442987557e-06, "loss": 0.89966899, "num_input_tokens_seen": 60320775, "step": 2801, "time_per_iteration": 3.3553528785705566 }, { "auxiliary_loss_clip": 0.01146949, "auxiliary_loss_mlp": 0.00761997, "balance_loss_clip": 1.05314827, "balance_loss_mlp": 1.0002563, "epoch": 0.3369205795707329, "flos": 24791219061120.0, "grad_norm": 1.5694243333978366, "language_loss": 0.85833454, "learning_rate": 3.090736695742308e-06, "loss": 0.877424, "num_input_tokens_seen": 60341905, "step": 2802, "time_per_iteration": 2.6145522594451904 }, { "auxiliary_loss_clip": 0.01129569, "auxiliary_loss_mlp": 0.0103306, "balance_loss_clip": 1.05161941, "balance_loss_mlp": 1.02509046, "epoch": 0.33704082246137196, "flos": 17931096161280.0, "grad_norm": 2.2847277011876828, "language_loss": 0.52509338, "learning_rate": 3.0900836817219713e-06, "loss": 0.54671967, "num_input_tokens_seen": 60358335, "step": 2803, "time_per_iteration": 2.6026904582977295 }, { "auxiliary_loss_clip": 0.01190892, "auxiliary_loss_mlp": 0.01032457, "balance_loss_clip": 1.05671525, "balance_loss_mlp": 1.02435076, "epoch": 0.33716106535201107, "flos": 21286517149440.0, "grad_norm": 1.5846024561861811, "language_loss": 0.83552849, "learning_rate": 3.089430502336807e-06, "loss": 0.85776198, "num_input_tokens_seen": 60378305, "step": 2804, "time_per_iteration": 2.495375156402588 }, { "auxiliary_loss_clip": 0.01181283, "auxiliary_loss_mlp": 0.01036679, "balance_loss_clip": 1.05780101, "balance_loss_mlp": 1.02791071, "epoch": 0.3372813082426502, "flos": 18402962152320.0, "grad_norm": 3.8361396244985366, "language_loss": 0.90315086, "learning_rate": 3.088777157685902e-06, "loss": 0.92533046, "num_input_tokens_seen": 60393895, "step": 2805, "time_per_iteration": 2.485994338989258 }, { "auxiliary_loss_clip": 0.01160191, "auxiliary_loss_mlp": 0.01027439, "balance_loss_clip": 1.05477929, "balance_loss_mlp": 1.01983976, "epoch": 0.33740155113328923, "flos": 17201391367680.0, "grad_norm": 2.0103006718874736, "language_loss": 0.85996425, "learning_rate": 3.088123647868367e-06, "loss": 0.88184059, "num_input_tokens_seen": 60410445, "step": 2806, "time_per_iteration": 2.5338659286499023 }, { "auxiliary_loss_clip": 0.01181153, "auxiliary_loss_mlp": 0.01030724, "balance_loss_clip": 1.05569839, "balance_loss_mlp": 1.02209377, "epoch": 0.33752179402392835, "flos": 29058950609280.0, "grad_norm": 1.8241599806688602, "language_loss": 0.81265974, "learning_rate": 3.0874699729833405e-06, "loss": 0.83477843, "num_input_tokens_seen": 60431815, "step": 2807, "time_per_iteration": 2.5975427627563477 }, { "auxiliary_loss_clip": 0.01157675, "auxiliary_loss_mlp": 0.01027428, "balance_loss_clip": 1.0527966, "balance_loss_mlp": 1.01934564, "epoch": 0.3376420369145674, "flos": 25080730680960.0, "grad_norm": 1.568958079945656, "language_loss": 0.79569912, "learning_rate": 3.086816133129983e-06, "loss": 0.81755012, "num_input_tokens_seen": 60452075, "step": 2808, "time_per_iteration": 2.605307102203369 }, { "auxiliary_loss_clip": 0.01194462, "auxiliary_loss_mlp": 0.01035035, "balance_loss_clip": 1.05949771, "balance_loss_mlp": 1.02713132, "epoch": 0.3377622798052065, "flos": 27490624007040.0, "grad_norm": 1.923238008300245, "language_loss": 0.76340878, "learning_rate": 3.0861621284074826e-06, "loss": 0.78570378, "num_input_tokens_seen": 60472600, "step": 2809, "time_per_iteration": 2.5336546897888184 }, { "auxiliary_loss_clip": 0.01170634, "auxiliary_loss_mlp": 0.01038935, "balance_loss_clip": 1.05683911, "balance_loss_mlp": 1.03134727, "epoch": 0.3378825226958456, "flos": 21975211589760.0, "grad_norm": 1.8511064054458861, "language_loss": 0.7292335, "learning_rate": 3.085507958915051e-06, "loss": 0.75132918, "num_input_tokens_seen": 60491030, "step": 2810, "time_per_iteration": 2.5961081981658936 }, { "auxiliary_loss_clip": 0.01163064, "auxiliary_loss_mlp": 0.0103852, "balance_loss_clip": 1.05676854, "balance_loss_mlp": 1.02937102, "epoch": 0.3380027655864847, "flos": 42523189200000.0, "grad_norm": 1.9966346595225377, "language_loss": 0.71038234, "learning_rate": 3.084853624751925e-06, "loss": 0.73239821, "num_input_tokens_seen": 60512615, "step": 2811, "time_per_iteration": 2.7424263954162598 }, { "auxiliary_loss_clip": 0.01151478, "auxiliary_loss_mlp": 0.01027376, "balance_loss_clip": 1.05503249, "balance_loss_mlp": 1.01865566, "epoch": 0.3381230084771238, "flos": 26725080418560.0, "grad_norm": 1.6389833589842036, "language_loss": 0.8572036, "learning_rate": 3.0841991260173668e-06, "loss": 0.8789922, "num_input_tokens_seen": 60532520, "step": 2812, "time_per_iteration": 2.636681318283081 }, { "auxiliary_loss_clip": 0.01193057, "auxiliary_loss_mlp": 0.01032131, "balance_loss_clip": 1.05889618, "balance_loss_mlp": 1.02369142, "epoch": 0.3382432513677629, "flos": 22710375250560.0, "grad_norm": 1.8027076801080775, "language_loss": 0.80156368, "learning_rate": 3.0835444628106634e-06, "loss": 0.82381558, "num_input_tokens_seen": 60551500, "step": 2813, "time_per_iteration": 2.4759881496429443 }, { "auxiliary_loss_clip": 0.01192352, "auxiliary_loss_mlp": 0.00761946, "balance_loss_clip": 1.05862045, "balance_loss_mlp": 1.00026762, "epoch": 0.33836349425840195, "flos": 22122409524480.0, "grad_norm": 1.872294907964951, "language_loss": 0.8309536, "learning_rate": 3.082889635231126e-06, "loss": 0.85049653, "num_input_tokens_seen": 60570160, "step": 2814, "time_per_iteration": 2.5127387046813965 }, { "auxiliary_loss_clip": 0.01164526, "auxiliary_loss_mlp": 0.01041515, "balance_loss_clip": 1.05288625, "balance_loss_mlp": 1.03273559, "epoch": 0.33848373714904106, "flos": 27308090067840.0, "grad_norm": 2.187105608582856, "language_loss": 0.76437581, "learning_rate": 3.0822346433780925e-06, "loss": 0.78643626, "num_input_tokens_seen": 60590885, "step": 2815, "time_per_iteration": 2.597376585006714 }, { "auxiliary_loss_clip": 0.01174435, "auxiliary_loss_mlp": 0.01034725, "balance_loss_clip": 1.05118477, "balance_loss_mlp": 1.02583754, "epoch": 0.3386039800396802, "flos": 25848716394240.0, "grad_norm": 2.727904837231005, "language_loss": 0.87216246, "learning_rate": 3.0815794873509237e-06, "loss": 0.89425397, "num_input_tokens_seen": 60609170, "step": 2816, "time_per_iteration": 2.5557913780212402 }, { "auxiliary_loss_clip": 0.01191807, "auxiliary_loss_mlp": 0.01031581, "balance_loss_clip": 1.05713618, "balance_loss_mlp": 1.02314091, "epoch": 0.33872422293031923, "flos": 18880646146560.0, "grad_norm": 1.9449016605541014, "language_loss": 0.72560668, "learning_rate": 3.0809241672490066e-06, "loss": 0.74784064, "num_input_tokens_seen": 60627340, "step": 2817, "time_per_iteration": 2.4725780487060547 }, { "auxiliary_loss_clip": 0.01164464, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.05596066, "balance_loss_mlp": 1.02420366, "epoch": 0.33884446582095834, "flos": 23146977064320.0, "grad_norm": 1.7563921586113573, "language_loss": 0.85033894, "learning_rate": 3.080268683171753e-06, "loss": 0.87230659, "num_input_tokens_seen": 60647630, "step": 2818, "time_per_iteration": 2.5601589679718018 }, { "auxiliary_loss_clip": 0.01176963, "auxiliary_loss_mlp": 0.01030315, "balance_loss_clip": 1.05374336, "balance_loss_mlp": 1.02178597, "epoch": 0.33896470871159745, "flos": 15997342544640.0, "grad_norm": 2.011754210651904, "language_loss": 0.89363098, "learning_rate": 3.0796130352185985e-06, "loss": 0.91570377, "num_input_tokens_seen": 60664485, "step": 2819, "time_per_iteration": 3.278252601623535 }, { "auxiliary_loss_clip": 0.011513, "auxiliary_loss_mlp": 0.00762806, "balance_loss_clip": 1.05013561, "balance_loss_mlp": 1.00028777, "epoch": 0.3390849516022365, "flos": 34495754112000.0, "grad_norm": 1.7191188425091566, "language_loss": 0.66406536, "learning_rate": 3.0789572234890057e-06, "loss": 0.68320644, "num_input_tokens_seen": 60686125, "step": 2820, "time_per_iteration": 2.6657886505126953 }, { "auxiliary_loss_clip": 0.01161132, "auxiliary_loss_mlp": 0.01033412, "balance_loss_clip": 1.05485559, "balance_loss_mlp": 1.02496564, "epoch": 0.3392051944928756, "flos": 16180307447040.0, "grad_norm": 1.5860837649000734, "language_loss": 0.77403164, "learning_rate": 3.0783012480824596e-06, "loss": 0.79597712, "num_input_tokens_seen": 60705270, "step": 2821, "time_per_iteration": 2.512835741043091 }, { "auxiliary_loss_clip": 0.01191022, "auxiliary_loss_mlp": 0.01040228, "balance_loss_clip": 1.05701792, "balance_loss_mlp": 1.0314095, "epoch": 0.33932543738351467, "flos": 17086656349440.0, "grad_norm": 2.4135441818121617, "language_loss": 0.7426461, "learning_rate": 3.077645109098471e-06, "loss": 0.76495862, "num_input_tokens_seen": 60721540, "step": 2822, "time_per_iteration": 2.4528794288635254 }, { "auxiliary_loss_clip": 0.01136201, "auxiliary_loss_mlp": 0.01030761, "balance_loss_clip": 1.05002451, "balance_loss_mlp": 1.02241611, "epoch": 0.3394456802741538, "flos": 22126970551680.0, "grad_norm": 1.6545927537226381, "language_loss": 0.72225451, "learning_rate": 3.076988806636577e-06, "loss": 0.74392414, "num_input_tokens_seen": 60739300, "step": 2823, "time_per_iteration": 2.564561605453491 }, { "auxiliary_loss_clip": 0.01166071, "auxiliary_loss_mlp": 0.00762682, "balance_loss_clip": 1.05588567, "balance_loss_mlp": 1.00028861, "epoch": 0.3395659231647929, "flos": 25226887121280.0, "grad_norm": 1.8257072180149934, "language_loss": 0.88669872, "learning_rate": 3.0763323407963377e-06, "loss": 0.90598625, "num_input_tokens_seen": 60758910, "step": 2824, "time_per_iteration": 3.3646271228790283 }, { "auxiliary_loss_clip": 0.01175345, "auxiliary_loss_mlp": 0.01030599, "balance_loss_clip": 1.053478, "balance_loss_mlp": 1.02217054, "epoch": 0.33968616605543195, "flos": 29096477343360.0, "grad_norm": 2.198216914701041, "language_loss": 0.80240387, "learning_rate": 3.075675711677337e-06, "loss": 0.82446325, "num_input_tokens_seen": 60779005, "step": 2825, "time_per_iteration": 3.29198956489563 }, { "auxiliary_loss_clip": 0.01157058, "auxiliary_loss_mlp": 0.01036488, "balance_loss_clip": 1.05518126, "balance_loss_mlp": 1.02806616, "epoch": 0.33980640894607106, "flos": 21433966479360.0, "grad_norm": 1.8007682715217816, "language_loss": 0.78273833, "learning_rate": 3.0750189193791865e-06, "loss": 0.80467379, "num_input_tokens_seen": 60798590, "step": 2826, "time_per_iteration": 2.53039288520813 }, { "auxiliary_loss_clip": 0.01176278, "auxiliary_loss_mlp": 0.01030003, "balance_loss_clip": 1.05625188, "balance_loss_mlp": 1.02203047, "epoch": 0.33992665183671017, "flos": 32490035596800.0, "grad_norm": 1.7157291951064704, "language_loss": 0.70254266, "learning_rate": 3.0743619640015203e-06, "loss": 0.72460544, "num_input_tokens_seen": 60818840, "step": 2827, "time_per_iteration": 3.4182803630828857 }, { "auxiliary_loss_clip": 0.01165959, "auxiliary_loss_mlp": 0.01037944, "balance_loss_clip": 1.05197906, "balance_loss_mlp": 1.02927744, "epoch": 0.3400468947273492, "flos": 17055414495360.0, "grad_norm": 1.9955731156893937, "language_loss": 0.92947441, "learning_rate": 3.073704845643999e-06, "loss": 0.95151347, "num_input_tokens_seen": 60835965, "step": 2828, "time_per_iteration": 2.52634596824646 }, { "auxiliary_loss_clip": 0.01176676, "auxiliary_loss_mlp": 0.01040316, "balance_loss_clip": 1.05217743, "balance_loss_mlp": 1.03164339, "epoch": 0.34016713761798834, "flos": 16872988296960.0, "grad_norm": 3.3029907220170065, "language_loss": 0.77702093, "learning_rate": 3.0730475644063063e-06, "loss": 0.79919082, "num_input_tokens_seen": 60851065, "step": 2829, "time_per_iteration": 2.4972875118255615 }, { "auxiliary_loss_clip": 0.01152056, "auxiliary_loss_mlp": 0.00761674, "balance_loss_clip": 1.0493679, "balance_loss_mlp": 1.00028718, "epoch": 0.34028738050862745, "flos": 21907161273600.0, "grad_norm": 1.86100310974661, "language_loss": 0.6520046, "learning_rate": 3.072390120388151e-06, "loss": 0.67114186, "num_input_tokens_seen": 60869390, "step": 2830, "time_per_iteration": 2.5432450771331787 }, { "auxiliary_loss_clip": 0.01177649, "auxiliary_loss_mlp": 0.01036666, "balance_loss_clip": 1.05566072, "balance_loss_mlp": 1.02739763, "epoch": 0.3404076233992665, "flos": 22746034477440.0, "grad_norm": 2.0940198208574152, "language_loss": 0.71006203, "learning_rate": 3.071732513689267e-06, "loss": 0.73220527, "num_input_tokens_seen": 60887925, "step": 2831, "time_per_iteration": 2.5226480960845947 }, { "auxiliary_loss_clip": 0.01177813, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.05777347, "balance_loss_mlp": 1.02562642, "epoch": 0.3405278662899056, "flos": 17052361839360.0, "grad_norm": 2.3294458236542357, "language_loss": 0.67156649, "learning_rate": 3.0710747444094134e-06, "loss": 0.69369066, "num_input_tokens_seen": 60905955, "step": 2832, "time_per_iteration": 2.498551607131958 }, { "auxiliary_loss_clip": 0.01166697, "auxiliary_loss_mlp": 0.01030942, "balance_loss_clip": 1.05585432, "balance_loss_mlp": 1.02193558, "epoch": 0.3406481091805447, "flos": 42813131783040.0, "grad_norm": 2.0546305736164494, "language_loss": 0.64696074, "learning_rate": 3.070416812648372e-06, "loss": 0.66893715, "num_input_tokens_seen": 60929405, "step": 2833, "time_per_iteration": 2.7838943004608154 }, { "auxiliary_loss_clip": 0.01144971, "auxiliary_loss_mlp": 0.01030355, "balance_loss_clip": 1.0482626, "balance_loss_mlp": 1.02218914, "epoch": 0.3407683520711838, "flos": 26761457917440.0, "grad_norm": 2.2514700970999577, "language_loss": 0.64593649, "learning_rate": 3.069758718505951e-06, "loss": 0.66768974, "num_input_tokens_seen": 60951145, "step": 2834, "time_per_iteration": 2.6428771018981934 }, { "auxiliary_loss_clip": 0.01190717, "auxiliary_loss_mlp": 0.01038, "balance_loss_clip": 1.05737376, "balance_loss_mlp": 1.02921391, "epoch": 0.3408885949618229, "flos": 28767643309440.0, "grad_norm": 1.602862020007956, "language_loss": 0.80054921, "learning_rate": 3.0691004620819836e-06, "loss": 0.8228364, "num_input_tokens_seen": 60971275, "step": 2835, "time_per_iteration": 2.530599355697632 }, { "auxiliary_loss_clip": 0.0104268, "auxiliary_loss_mlp": 0.01003975, "balance_loss_clip": 1.02423918, "balance_loss_mlp": 1.00251508, "epoch": 0.341008837852462, "flos": 63576252881280.0, "grad_norm": 0.802397151761232, "language_loss": 0.60220003, "learning_rate": 3.0684420434763254e-06, "loss": 0.6226666, "num_input_tokens_seen": 61037460, "step": 2836, "time_per_iteration": 3.2236483097076416 }, { "auxiliary_loss_clip": 0.01138947, "auxiliary_loss_mlp": 0.01028595, "balance_loss_clip": 1.05140924, "balance_loss_mlp": 1.02095413, "epoch": 0.34112908074310105, "flos": 20812173120000.0, "grad_norm": 1.6965154559731834, "language_loss": 0.76980627, "learning_rate": 3.06778346278886e-06, "loss": 0.79148173, "num_input_tokens_seen": 61056295, "step": 2837, "time_per_iteration": 2.5683131217956543 }, { "auxiliary_loss_clip": 0.01192904, "auxiliary_loss_mlp": 0.01028243, "balance_loss_clip": 1.05972683, "balance_loss_mlp": 1.01948118, "epoch": 0.34124932363374016, "flos": 24976446520320.0, "grad_norm": 1.6446923067510189, "language_loss": 0.78823817, "learning_rate": 3.0671247201194906e-06, "loss": 0.8104496, "num_input_tokens_seen": 61078430, "step": 2838, "time_per_iteration": 2.5430715084075928 }, { "auxiliary_loss_clip": 0.01151785, "auxiliary_loss_mlp": 0.01033189, "balance_loss_clip": 1.05216396, "balance_loss_mlp": 1.02413535, "epoch": 0.3413695665243792, "flos": 28402970480640.0, "grad_norm": 2.009135435927844, "language_loss": 0.75689852, "learning_rate": 3.066465815568151e-06, "loss": 0.77874833, "num_input_tokens_seen": 61099260, "step": 2839, "time_per_iteration": 2.6277284622192383 }, { "auxiliary_loss_clip": 0.01178034, "auxiliary_loss_mlp": 0.01032827, "balance_loss_clip": 1.05331826, "balance_loss_mlp": 1.02374387, "epoch": 0.34148980941501833, "flos": 25302012416640.0, "grad_norm": 1.700372809016299, "language_loss": 0.68840528, "learning_rate": 3.0658067492347947e-06, "loss": 0.71051395, "num_input_tokens_seen": 61121900, "step": 2840, "time_per_iteration": 2.582031726837158 }, { "auxiliary_loss_clip": 0.011007, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.04895031, "balance_loss_mlp": 1.02226198, "epoch": 0.34161005230565744, "flos": 17530081747200.0, "grad_norm": 1.8531225065152177, "language_loss": 0.6694631, "learning_rate": 3.065147521219402e-06, "loss": 0.69077945, "num_input_tokens_seen": 61141155, "step": 2841, "time_per_iteration": 2.6534531116485596 }, { "auxiliary_loss_clip": 0.01153665, "auxiliary_loss_mlp": 0.01030281, "balance_loss_clip": 1.05413032, "balance_loss_mlp": 1.02260971, "epoch": 0.3417302951962965, "flos": 43650101566080.0, "grad_norm": 1.426722176702241, "language_loss": 0.74023664, "learning_rate": 3.064488131621977e-06, "loss": 0.76207614, "num_input_tokens_seen": 61164480, "step": 2842, "time_per_iteration": 2.7668306827545166 }, { "auxiliary_loss_clip": 0.01172887, "auxiliary_loss_mlp": 0.01037833, "balance_loss_clip": 1.05551434, "balance_loss_mlp": 1.02982235, "epoch": 0.3418505380869356, "flos": 30882207012480.0, "grad_norm": 2.0514195590819932, "language_loss": 0.73701549, "learning_rate": 3.063828580542549e-06, "loss": 0.75912273, "num_input_tokens_seen": 61185675, "step": 2843, "time_per_iteration": 2.587906837463379 }, { "auxiliary_loss_clip": 0.01165168, "auxiliary_loss_mlp": 0.01032619, "balance_loss_clip": 1.05492759, "balance_loss_mlp": 1.02457857, "epoch": 0.3419707809775747, "flos": 19463871277440.0, "grad_norm": 1.893643085613405, "language_loss": 0.73403263, "learning_rate": 3.0631688680811706e-06, "loss": 0.75601059, "num_input_tokens_seen": 61205300, "step": 2844, "time_per_iteration": 3.3214094638824463 }, { "auxiliary_loss_clip": 0.0119249, "auxiliary_loss_mlp": 0.01030194, "balance_loss_clip": 1.05659175, "balance_loss_mlp": 1.02208149, "epoch": 0.3420910238682138, "flos": 28727818104960.0, "grad_norm": 1.8121295838037192, "language_loss": 0.75232315, "learning_rate": 3.062508994337921e-06, "loss": 0.77455002, "num_input_tokens_seen": 61224905, "step": 2845, "time_per_iteration": 2.5559680461883545 }, { "auxiliary_loss_clip": 0.01175876, "auxiliary_loss_mlp": 0.01034293, "balance_loss_clip": 1.05421233, "balance_loss_mlp": 1.02475059, "epoch": 0.3422112667588529, "flos": 21397265758080.0, "grad_norm": 2.0255203308878897, "language_loss": 0.79191744, "learning_rate": 3.0618489594129013e-06, "loss": 0.81401908, "num_input_tokens_seen": 61243045, "step": 2846, "time_per_iteration": 2.5383517742156982 }, { "auxiliary_loss_clip": 0.01152829, "auxiliary_loss_mlp": 0.01039213, "balance_loss_clip": 1.05480444, "balance_loss_mlp": 1.03037977, "epoch": 0.342331509649492, "flos": 13881450038400.0, "grad_norm": 1.985554383450499, "language_loss": 0.7089622, "learning_rate": 3.061188763406239e-06, "loss": 0.73088264, "num_input_tokens_seen": 61259190, "step": 2847, "time_per_iteration": 2.5748870372772217 }, { "auxiliary_loss_clip": 0.01160092, "auxiliary_loss_mlp": 0.01034729, "balance_loss_clip": 1.05452132, "balance_loss_mlp": 1.02525771, "epoch": 0.34245175254013105, "flos": 28621450955520.0, "grad_norm": 2.286083562027519, "language_loss": 0.82508421, "learning_rate": 3.060528406418085e-06, "loss": 0.84703243, "num_input_tokens_seen": 61279040, "step": 2848, "time_per_iteration": 2.622223377227783 }, { "auxiliary_loss_clip": 0.01155057, "auxiliary_loss_mlp": 0.01031394, "balance_loss_clip": 1.05324817, "balance_loss_mlp": 1.02378893, "epoch": 0.34257199543077016, "flos": 34127058960000.0, "grad_norm": 1.9795640356133584, "language_loss": 0.61918843, "learning_rate": 3.0598678885486145e-06, "loss": 0.6410529, "num_input_tokens_seen": 61301580, "step": 2849, "time_per_iteration": 2.708325147628784 }, { "auxiliary_loss_clip": 0.0114752, "auxiliary_loss_mlp": 0.00761769, "balance_loss_clip": 1.04964995, "balance_loss_mlp": 1.00029707, "epoch": 0.34269223832140927, "flos": 19974018188160.0, "grad_norm": 1.6692644985382394, "language_loss": 0.74580455, "learning_rate": 3.0592072098980282e-06, "loss": 0.76489741, "num_input_tokens_seen": 61321240, "step": 2850, "time_per_iteration": 3.3515524864196777 }, { "auxiliary_loss_clip": 0.01155259, "auxiliary_loss_mlp": 0.01036409, "balance_loss_clip": 1.05211473, "balance_loss_mlp": 1.02810073, "epoch": 0.3428124812120483, "flos": 27235658292480.0, "grad_norm": 1.7357771225804999, "language_loss": 0.72894919, "learning_rate": 3.0585463705665514e-06, "loss": 0.75086594, "num_input_tokens_seen": 61341615, "step": 2851, "time_per_iteration": 3.339571952819824 }, { "auxiliary_loss_clip": 0.01146664, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.0509665, "balance_loss_mlp": 1.02541327, "epoch": 0.34293272410268744, "flos": 24570871079040.0, "grad_norm": 2.3144846115702187, "language_loss": 0.70466191, "learning_rate": 3.0578853706544304e-06, "loss": 0.72646868, "num_input_tokens_seen": 61359005, "step": 2852, "time_per_iteration": 2.6213293075561523 }, { "auxiliary_loss_clip": 0.01151094, "auxiliary_loss_mlp": 0.00762408, "balance_loss_clip": 1.05276978, "balance_loss_mlp": 1.00028503, "epoch": 0.34305296699332655, "flos": 21506865131520.0, "grad_norm": 2.079107564018096, "language_loss": 0.65206051, "learning_rate": 3.0572242102619404e-06, "loss": 0.67119551, "num_input_tokens_seen": 61376160, "step": 2853, "time_per_iteration": 3.4474191665649414 }, { "auxiliary_loss_clip": 0.01164893, "auxiliary_loss_mlp": 0.01037592, "balance_loss_clip": 1.05714607, "balance_loss_mlp": 1.02923536, "epoch": 0.3431732098839656, "flos": 24056665931520.0, "grad_norm": 3.8136508447317925, "language_loss": 0.80387664, "learning_rate": 3.0565628894893784e-06, "loss": 0.82590151, "num_input_tokens_seen": 61396795, "step": 2854, "time_per_iteration": 2.615370273590088 }, { "auxiliary_loss_clip": 0.01169414, "auxiliary_loss_mlp": 0.01039949, "balance_loss_clip": 1.05549955, "balance_loss_mlp": 1.03111589, "epoch": 0.3432934527746047, "flos": 16800879744000.0, "grad_norm": 2.157918941321947, "language_loss": 0.74816394, "learning_rate": 3.0559014084370655e-06, "loss": 0.77025759, "num_input_tokens_seen": 61415320, "step": 2855, "time_per_iteration": 2.5045769214630127 }, { "auxiliary_loss_clip": 0.0116914, "auxiliary_loss_mlp": 0.0103999, "balance_loss_clip": 1.05488348, "balance_loss_mlp": 1.03076303, "epoch": 0.34341369566524377, "flos": 23439720908160.0, "grad_norm": 1.6457957070063196, "language_loss": 0.78553891, "learning_rate": 3.055239767205349e-06, "loss": 0.80763018, "num_input_tokens_seen": 61437070, "step": 2856, "time_per_iteration": 2.608302354812622 }, { "auxiliary_loss_clip": 0.01180788, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.06112814, "balance_loss_mlp": 1.02634954, "epoch": 0.3435339385558829, "flos": 17267466435840.0, "grad_norm": 1.9175001253014992, "language_loss": 0.7821793, "learning_rate": 3.054577965894599e-06, "loss": 0.80433691, "num_input_tokens_seen": 61453215, "step": 2857, "time_per_iteration": 2.4958202838897705 }, { "auxiliary_loss_clip": 0.01172673, "auxiliary_loss_mlp": 0.01040618, "balance_loss_clip": 1.05834937, "balance_loss_mlp": 1.0313344, "epoch": 0.343654181446522, "flos": 22199366413440.0, "grad_norm": 2.607507147438953, "language_loss": 0.70414674, "learning_rate": 3.0539160046052094e-06, "loss": 0.72627962, "num_input_tokens_seen": 61472915, "step": 2858, "time_per_iteration": 2.592711925506592 }, { "auxiliary_loss_clip": 0.01154559, "auxiliary_loss_mlp": 0.0103763, "balance_loss_clip": 1.05229402, "balance_loss_mlp": 1.02805746, "epoch": 0.34377442433716104, "flos": 19901801894400.0, "grad_norm": 2.1551212831913644, "language_loss": 0.7042076, "learning_rate": 3.0532538834376003e-06, "loss": 0.72612953, "num_input_tokens_seen": 61492475, "step": 2859, "time_per_iteration": 2.543410301208496 }, { "auxiliary_loss_clip": 0.01184308, "auxiliary_loss_mlp": 0.01032663, "balance_loss_clip": 1.05665779, "balance_loss_mlp": 1.02385008, "epoch": 0.34389466722780015, "flos": 22197678474240.0, "grad_norm": 2.0530482106261903, "language_loss": 0.78173244, "learning_rate": 3.0525916024922143e-06, "loss": 0.80390209, "num_input_tokens_seen": 61511660, "step": 2860, "time_per_iteration": 2.5327742099761963 }, { "auxiliary_loss_clip": 0.01160181, "auxiliary_loss_mlp": 0.01031739, "balance_loss_clip": 1.0522635, "balance_loss_mlp": 1.02348423, "epoch": 0.34401491011843927, "flos": 18624567110400.0, "grad_norm": 2.74416779527112, "language_loss": 0.84634113, "learning_rate": 3.0519291618695193e-06, "loss": 0.86826032, "num_input_tokens_seen": 61529060, "step": 2861, "time_per_iteration": 2.512873411178589 }, { "auxiliary_loss_clip": 0.01135972, "auxiliary_loss_mlp": 0.01033796, "balance_loss_clip": 1.04670668, "balance_loss_mlp": 1.02531433, "epoch": 0.3441351530090783, "flos": 17858197509120.0, "grad_norm": 1.605653584109235, "language_loss": 0.75582016, "learning_rate": 3.0512665616700065e-06, "loss": 0.7775178, "num_input_tokens_seen": 61548125, "step": 2862, "time_per_iteration": 2.5660040378570557 }, { "auxiliary_loss_clip": 0.01124829, "auxiliary_loss_mlp": 0.01040939, "balance_loss_clip": 1.04654288, "balance_loss_mlp": 1.03178382, "epoch": 0.34425539589971743, "flos": 23112754381440.0, "grad_norm": 2.208463504641682, "language_loss": 0.88893104, "learning_rate": 3.0506038019941933e-06, "loss": 0.91058868, "num_input_tokens_seen": 61568135, "step": 2863, "time_per_iteration": 2.6123194694519043 }, { "auxiliary_loss_clip": 0.01152503, "auxiliary_loss_mlp": 0.01044163, "balance_loss_clip": 1.05508792, "balance_loss_mlp": 1.03495371, "epoch": 0.34437563879035654, "flos": 21907699977600.0, "grad_norm": 2.4922091281140397, "language_loss": 0.67449784, "learning_rate": 3.049940882942617e-06, "loss": 0.69646448, "num_input_tokens_seen": 61586920, "step": 2864, "time_per_iteration": 2.6509993076324463 }, { "auxiliary_loss_clip": 0.01191668, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.05774689, "balance_loss_mlp": 1.02364647, "epoch": 0.3444958816809956, "flos": 23076915586560.0, "grad_norm": 2.2966588028618347, "language_loss": 0.80542159, "learning_rate": 3.0492778046158448e-06, "loss": 0.82766384, "num_input_tokens_seen": 61608340, "step": 2865, "time_per_iteration": 2.5026814937591553 }, { "auxiliary_loss_clip": 0.0117909, "auxiliary_loss_mlp": 0.01036214, "balance_loss_clip": 1.05948448, "balance_loss_mlp": 1.02755952, "epoch": 0.3446161245716347, "flos": 21908633731200.0, "grad_norm": 2.7434743514803266, "language_loss": 0.7633732, "learning_rate": 3.0486145671144633e-06, "loss": 0.78552628, "num_input_tokens_seen": 61628130, "step": 2866, "time_per_iteration": 2.5402798652648926 }, { "auxiliary_loss_clip": 0.01100667, "auxiliary_loss_mlp": 0.01038936, "balance_loss_clip": 1.04558408, "balance_loss_mlp": 1.0299474, "epoch": 0.3447363674622738, "flos": 25112834461440.0, "grad_norm": 2.285007974990173, "language_loss": 0.77324098, "learning_rate": 3.047951170539086e-06, "loss": 0.79463702, "num_input_tokens_seen": 61647755, "step": 2867, "time_per_iteration": 2.6735188961029053 }, { "auxiliary_loss_clip": 0.01146246, "auxiliary_loss_mlp": 0.01032734, "balance_loss_clip": 1.05466199, "balance_loss_mlp": 1.02493763, "epoch": 0.3448566103529129, "flos": 11984684451840.0, "grad_norm": 1.8631617225080825, "language_loss": 0.84134984, "learning_rate": 3.047287614990349e-06, "loss": 0.86313963, "num_input_tokens_seen": 61665675, "step": 2868, "time_per_iteration": 2.598815441131592 }, { "auxiliary_loss_clip": 0.0115339, "auxiliary_loss_mlp": 0.01039917, "balance_loss_clip": 1.05296254, "balance_loss_mlp": 1.03088093, "epoch": 0.344976853243552, "flos": 40187882465280.0, "grad_norm": 2.942548438883092, "language_loss": 0.61826527, "learning_rate": 3.046623900568914e-06, "loss": 0.64019835, "num_input_tokens_seen": 61688240, "step": 2869, "time_per_iteration": 2.720792531967163 }, { "auxiliary_loss_clip": 0.01157242, "auxiliary_loss_mlp": 0.0104119, "balance_loss_clip": 1.05133724, "balance_loss_mlp": 1.03134918, "epoch": 0.34509709613419104, "flos": 28723652127360.0, "grad_norm": 3.3139639472960547, "language_loss": 0.70045179, "learning_rate": 3.045960027375465e-06, "loss": 0.72243613, "num_input_tokens_seen": 61706075, "step": 2870, "time_per_iteration": 2.6303727626800537 }, { "auxiliary_loss_clip": 0.0118224, "auxiliary_loss_mlp": 0.01035027, "balance_loss_clip": 1.05481529, "balance_loss_mlp": 1.02666473, "epoch": 0.34521733902483015, "flos": 29967597982080.0, "grad_norm": 3.8583288313236626, "language_loss": 0.82627445, "learning_rate": 3.045295995510711e-06, "loss": 0.84844708, "num_input_tokens_seen": 61723045, "step": 2871, "time_per_iteration": 3.4023141860961914 }, { "auxiliary_loss_clip": 0.01159703, "auxiliary_loss_mlp": 0.01026248, "balance_loss_clip": 1.05464697, "balance_loss_mlp": 1.01860034, "epoch": 0.34533758191546926, "flos": 27923059843200.0, "grad_norm": 1.817959585658311, "language_loss": 0.73653615, "learning_rate": 3.0446318050753865e-06, "loss": 0.75839567, "num_input_tokens_seen": 61743525, "step": 2872, "time_per_iteration": 2.61146879196167 }, { "auxiliary_loss_clip": 0.01169738, "auxiliary_loss_mlp": 0.01033053, "balance_loss_clip": 1.05406547, "balance_loss_mlp": 1.02536976, "epoch": 0.3454578248061083, "flos": 27125879351040.0, "grad_norm": 2.2719004137655014, "language_loss": 0.77598071, "learning_rate": 3.0439674561702474e-06, "loss": 0.79800856, "num_input_tokens_seen": 61763025, "step": 2873, "time_per_iteration": 2.573303699493408 }, { "auxiliary_loss_clip": 0.01177314, "auxiliary_loss_mlp": 0.01036719, "balance_loss_clip": 1.05665421, "balance_loss_mlp": 1.02827883, "epoch": 0.3455780676967474, "flos": 19024899166080.0, "grad_norm": 2.222576806120435, "language_loss": 0.87929481, "learning_rate": 3.043302948896076e-06, "loss": 0.90143514, "num_input_tokens_seen": 61781630, "step": 2874, "time_per_iteration": 2.496525287628174 }, { "auxiliary_loss_clip": 0.01124002, "auxiliary_loss_mlp": 0.01037269, "balance_loss_clip": 1.050933, "balance_loss_mlp": 1.02807212, "epoch": 0.34569831058738654, "flos": 34496005507200.0, "grad_norm": 2.0787179683340393, "language_loss": 0.6029523, "learning_rate": 3.0426382833536756e-06, "loss": 0.62456501, "num_input_tokens_seen": 61804985, "step": 2875, "time_per_iteration": 2.721073627471924 }, { "auxiliary_loss_clip": 0.01141009, "auxiliary_loss_mlp": 0.01032114, "balance_loss_clip": 1.04827654, "balance_loss_mlp": 1.02369213, "epoch": 0.3458185534780256, "flos": 31138681098240.0, "grad_norm": 1.9925954619707191, "language_loss": 0.77910405, "learning_rate": 3.041973459643877e-06, "loss": 0.80083525, "num_input_tokens_seen": 61824440, "step": 2876, "time_per_iteration": 3.4137444496154785 }, { "auxiliary_loss_clip": 0.01125096, "auxiliary_loss_mlp": 0.01028455, "balance_loss_clip": 1.04619622, "balance_loss_mlp": 1.02003241, "epoch": 0.3459387963686647, "flos": 32452508862720.0, "grad_norm": 2.1972526276650415, "language_loss": 0.67016023, "learning_rate": 3.0413084778675334e-06, "loss": 0.69169575, "num_input_tokens_seen": 61845690, "step": 2877, "time_per_iteration": 3.3869800567626953 }, { "auxiliary_loss_clip": 0.01152927, "auxiliary_loss_mlp": 0.00761456, "balance_loss_clip": 1.04979467, "balance_loss_mlp": 1.00029004, "epoch": 0.3460590392593038, "flos": 24675658030080.0, "grad_norm": 3.85020938508407, "language_loss": 0.83856714, "learning_rate": 3.0406433381255214e-06, "loss": 0.85771102, "num_input_tokens_seen": 61863725, "step": 2878, "time_per_iteration": 2.6087992191314697 }, { "auxiliary_loss_clip": 0.01176354, "auxiliary_loss_mlp": 0.01033122, "balance_loss_clip": 1.05801678, "balance_loss_mlp": 1.02454448, "epoch": 0.34617928214994287, "flos": 18807316531200.0, "grad_norm": 2.2854980551959168, "language_loss": 0.82412827, "learning_rate": 3.0399780405187425e-06, "loss": 0.846223, "num_input_tokens_seen": 61882720, "step": 2879, "time_per_iteration": 3.3461132049560547 }, { "auxiliary_loss_clip": 0.01170945, "auxiliary_loss_mlp": 0.01036051, "balance_loss_clip": 1.05232525, "balance_loss_mlp": 1.0279572, "epoch": 0.346299525040582, "flos": 24857653265280.0, "grad_norm": 1.9681873582175926, "language_loss": 0.78590351, "learning_rate": 3.0393125851481216e-06, "loss": 0.8079735, "num_input_tokens_seen": 61902595, "step": 2880, "time_per_iteration": 2.5765316486358643 }, { "auxiliary_loss_clip": 0.01143471, "auxiliary_loss_mlp": 0.01029632, "balance_loss_clip": 1.05329752, "balance_loss_mlp": 1.02148092, "epoch": 0.3464197679312211, "flos": 16434914025600.0, "grad_norm": 2.3105710559914803, "language_loss": 0.86268967, "learning_rate": 3.038646972114608e-06, "loss": 0.88442069, "num_input_tokens_seen": 61918920, "step": 2881, "time_per_iteration": 2.541077136993408 }, { "auxiliary_loss_clip": 0.01149061, "auxiliary_loss_mlp": 0.01039922, "balance_loss_clip": 1.05784321, "balance_loss_mlp": 1.03132725, "epoch": 0.34654001082186014, "flos": 22382474970240.0, "grad_norm": 2.492189286843929, "language_loss": 0.67583495, "learning_rate": 3.037981201519174e-06, "loss": 0.69772476, "num_input_tokens_seen": 61939520, "step": 2882, "time_per_iteration": 2.646679401397705 }, { "auxiliary_loss_clip": 0.01177203, "auxiliary_loss_mlp": 0.01028431, "balance_loss_clip": 1.05959392, "balance_loss_mlp": 1.0204916, "epoch": 0.34666025371249926, "flos": 19573901614080.0, "grad_norm": 1.99103174915717, "language_loss": 0.71375644, "learning_rate": 3.0373152734628175e-06, "loss": 0.73581278, "num_input_tokens_seen": 61957800, "step": 2883, "time_per_iteration": 2.5149483680725098 }, { "auxiliary_loss_clip": 0.01169244, "auxiliary_loss_mlp": 0.01033179, "balance_loss_clip": 1.05266118, "balance_loss_mlp": 1.02457762, "epoch": 0.34678049660313837, "flos": 15267637751040.0, "grad_norm": 2.0146061392046097, "language_loss": 0.76009762, "learning_rate": 3.0366491880465584e-06, "loss": 0.7821219, "num_input_tokens_seen": 61975820, "step": 2884, "time_per_iteration": 2.53934645652771 }, { "auxiliary_loss_clip": 0.01193781, "auxiliary_loss_mlp": 0.01045227, "balance_loss_clip": 1.05823076, "balance_loss_mlp": 1.03692985, "epoch": 0.3469007394937774, "flos": 21181550630400.0, "grad_norm": 1.4890128602858559, "language_loss": 0.81952977, "learning_rate": 3.035982945371443e-06, "loss": 0.8419199, "num_input_tokens_seen": 61997515, "step": 2885, "time_per_iteration": 2.516177177429199 }, { "auxiliary_loss_clip": 0.01172027, "auxiliary_loss_mlp": 0.01036625, "balance_loss_clip": 1.05747581, "balance_loss_mlp": 1.0285604, "epoch": 0.34702098238441653, "flos": 22375471818240.0, "grad_norm": 2.117832509133662, "language_loss": 0.85273528, "learning_rate": 3.035316545538537e-06, "loss": 0.8748219, "num_input_tokens_seen": 62016310, "step": 2886, "time_per_iteration": 2.5861568450927734 }, { "auxiliary_loss_clip": 0.0116115, "auxiliary_loss_mlp": 0.0103564, "balance_loss_clip": 1.05753851, "balance_loss_mlp": 1.02778959, "epoch": 0.3471412252750556, "flos": 22929430343040.0, "grad_norm": 3.5052503795249366, "language_loss": 0.79116368, "learning_rate": 3.034649988648935e-06, "loss": 0.81313157, "num_input_tokens_seen": 62036075, "step": 2887, "time_per_iteration": 2.5679469108581543 }, { "auxiliary_loss_clip": 0.01163963, "auxiliary_loss_mlp": 0.01036505, "balance_loss_clip": 1.05419993, "balance_loss_mlp": 1.02849448, "epoch": 0.3472614681656947, "flos": 21324259365120.0, "grad_norm": 1.637903499431211, "language_loss": 0.80743718, "learning_rate": 3.033983274803752e-06, "loss": 0.82944185, "num_input_tokens_seen": 62055865, "step": 2888, "time_per_iteration": 2.5939276218414307 }, { "auxiliary_loss_clip": 0.01158088, "auxiliary_loss_mlp": 0.01037794, "balance_loss_clip": 1.05203962, "balance_loss_mlp": 1.02955365, "epoch": 0.3473817110563338, "flos": 23475739271040.0, "grad_norm": 2.182814968388955, "language_loss": 0.72399664, "learning_rate": 3.0333164041041283e-06, "loss": 0.74595547, "num_input_tokens_seen": 62072180, "step": 2889, "time_per_iteration": 2.58298397064209 }, { "auxiliary_loss_clip": 0.0112093, "auxiliary_loss_mlp": 0.01028831, "balance_loss_clip": 1.04796851, "balance_loss_mlp": 1.02080846, "epoch": 0.34750195394697286, "flos": 22346025644160.0, "grad_norm": 1.733505955159925, "language_loss": 0.72082144, "learning_rate": 3.032649376651228e-06, "loss": 0.74231899, "num_input_tokens_seen": 62091600, "step": 2890, "time_per_iteration": 2.6508595943450928 }, { "auxiliary_loss_clip": 0.01149119, "auxiliary_loss_mlp": 0.01027821, "balance_loss_clip": 1.05201578, "balance_loss_mlp": 1.01880884, "epoch": 0.347622196837612, "flos": 29095004885760.0, "grad_norm": 1.934230461291507, "language_loss": 0.75887275, "learning_rate": 3.031982192546238e-06, "loss": 0.78064221, "num_input_tokens_seen": 62114695, "step": 2891, "time_per_iteration": 2.6416945457458496 }, { "auxiliary_loss_clip": 0.01174891, "auxiliary_loss_mlp": 0.01031807, "balance_loss_clip": 1.05370033, "balance_loss_mlp": 1.02365315, "epoch": 0.3477424397282511, "flos": 22455732758400.0, "grad_norm": 2.5063923598455813, "language_loss": 0.94305003, "learning_rate": 3.0313148518903696e-06, "loss": 0.96511704, "num_input_tokens_seen": 62134520, "step": 2892, "time_per_iteration": 2.558743953704834 }, { "auxiliary_loss_clip": 0.01161799, "auxiliary_loss_mlp": 0.01029946, "balance_loss_clip": 1.05358434, "balance_loss_mlp": 1.02226019, "epoch": 0.34786268261889014, "flos": 15778790242560.0, "grad_norm": 1.9840017781610457, "language_loss": 0.81326824, "learning_rate": 3.030647354784859e-06, "loss": 0.83518577, "num_input_tokens_seen": 62151560, "step": 2893, "time_per_iteration": 2.529383659362793 }, { "auxiliary_loss_clip": 0.01143491, "auxiliary_loss_mlp": 0.01035493, "balance_loss_clip": 1.05040693, "balance_loss_mlp": 1.0278163, "epoch": 0.34798292550952925, "flos": 20777627214720.0, "grad_norm": 1.7967939834736704, "language_loss": 0.77679121, "learning_rate": 3.029979701330964e-06, "loss": 0.798581, "num_input_tokens_seen": 62170985, "step": 2894, "time_per_iteration": 2.5791220664978027 }, { "auxiliary_loss_clip": 0.01162493, "auxiliary_loss_mlp": 0.01034549, "balance_loss_clip": 1.05122352, "balance_loss_mlp": 1.02669334, "epoch": 0.34810316840016836, "flos": 19937820257280.0, "grad_norm": 2.389606533739234, "language_loss": 0.8031978, "learning_rate": 3.029311891629966e-06, "loss": 0.82516825, "num_input_tokens_seen": 62189440, "step": 2895, "time_per_iteration": 2.5644376277923584 }, { "auxiliary_loss_clip": 0.0116181, "auxiliary_loss_mlp": 0.01031157, "balance_loss_clip": 1.05474138, "balance_loss_mlp": 1.02345014, "epoch": 0.3482234112908074, "flos": 23623296341760.0, "grad_norm": 1.7441054471782214, "language_loss": 0.74288774, "learning_rate": 3.0286439257831744e-06, "loss": 0.76481742, "num_input_tokens_seen": 62208910, "step": 2896, "time_per_iteration": 3.414696216583252 }, { "auxiliary_loss_clip": 0.0119121, "auxiliary_loss_mlp": 0.01034536, "balance_loss_clip": 1.05538344, "balance_loss_mlp": 1.02366972, "epoch": 0.3483436541814465, "flos": 23986712194560.0, "grad_norm": 1.847366152925246, "language_loss": 0.7148751, "learning_rate": 3.0279758038919156e-06, "loss": 0.73713255, "num_input_tokens_seen": 62227135, "step": 2897, "time_per_iteration": 2.5234873294830322 }, { "auxiliary_loss_clip": 0.01176337, "auxiliary_loss_mlp": 0.01031439, "balance_loss_clip": 1.05487514, "balance_loss_mlp": 1.02268898, "epoch": 0.34846389707208564, "flos": 22638338524800.0, "grad_norm": 2.124292321172943, "language_loss": 0.78314304, "learning_rate": 3.0273075260575455e-06, "loss": 0.80522078, "num_input_tokens_seen": 62246035, "step": 2898, "time_per_iteration": 2.5328586101531982 }, { "auxiliary_loss_clip": 0.01163919, "auxiliary_loss_mlp": 0.01034993, "balance_loss_clip": 1.05322409, "balance_loss_mlp": 1.02617764, "epoch": 0.3485841399627247, "flos": 21792857218560.0, "grad_norm": 1.8180860629297626, "language_loss": 0.80721939, "learning_rate": 3.0266390923814396e-06, "loss": 0.82920849, "num_input_tokens_seen": 62264095, "step": 2899, "time_per_iteration": 2.571446657180786 }, { "auxiliary_loss_clip": 0.01163365, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.05548978, "balance_loss_mlp": 1.02611649, "epoch": 0.3487043828533638, "flos": 17019036996480.0, "grad_norm": 1.8417094361743132, "language_loss": 0.82101685, "learning_rate": 3.0259705029650008e-06, "loss": 0.84299314, "num_input_tokens_seen": 62282025, "step": 2900, "time_per_iteration": 2.515490770339966 }, { "auxiliary_loss_clip": 0.01174714, "auxiliary_loss_mlp": 0.01024733, "balance_loss_clip": 1.05273795, "balance_loss_mlp": 1.0166328, "epoch": 0.34882462574400286, "flos": 22601135013120.0, "grad_norm": 1.6746819025162287, "language_loss": 0.72999501, "learning_rate": 3.025301757909652e-06, "loss": 0.75198948, "num_input_tokens_seen": 62302220, "step": 2901, "time_per_iteration": 3.303964853286743 }, { "auxiliary_loss_clip": 0.01153466, "auxiliary_loss_mlp": 0.00762311, "balance_loss_clip": 1.05559707, "balance_loss_mlp": 1.00019193, "epoch": 0.34894486863464197, "flos": 29861518141440.0, "grad_norm": 2.2037908454618753, "language_loss": 0.80728281, "learning_rate": 3.024632857316842e-06, "loss": 0.82644057, "num_input_tokens_seen": 62323535, "step": 2902, "time_per_iteration": 2.656179189682007 }, { "auxiliary_loss_clip": 0.0117763, "auxiliary_loss_mlp": 0.0103502, "balance_loss_clip": 1.05547309, "balance_loss_mlp": 1.02607965, "epoch": 0.3490651115252811, "flos": 22122265870080.0, "grad_norm": 1.9503382955658362, "language_loss": 0.77595806, "learning_rate": 3.0239638012880412e-06, "loss": 0.79808462, "num_input_tokens_seen": 62343430, "step": 2903, "time_per_iteration": 3.2871875762939453 }, { "auxiliary_loss_clip": 0.01124763, "auxiliary_loss_mlp": 0.01036805, "balance_loss_clip": 1.04801464, "balance_loss_mlp": 1.0274353, "epoch": 0.34918535441592014, "flos": 12676682943360.0, "grad_norm": 3.055778145098356, "language_loss": 0.81653523, "learning_rate": 3.0232945899247466e-06, "loss": 0.83815086, "num_input_tokens_seen": 62360365, "step": 2904, "time_per_iteration": 2.5806732177734375 }, { "auxiliary_loss_clip": 0.01176442, "auxiliary_loss_mlp": 0.01033246, "balance_loss_clip": 1.053442, "balance_loss_mlp": 1.02340484, "epoch": 0.34930559730655925, "flos": 23185617120000.0, "grad_norm": 2.3887718298138165, "language_loss": 0.7776227, "learning_rate": 3.022625223328476e-06, "loss": 0.79971957, "num_input_tokens_seen": 62382105, "step": 2905, "time_per_iteration": 3.392902135848999 }, { "auxiliary_loss_clip": 0.01180573, "auxiliary_loss_mlp": 0.01035077, "balance_loss_clip": 1.05525625, "balance_loss_mlp": 1.0261898, "epoch": 0.34942584019719836, "flos": 22855023319680.0, "grad_norm": 1.4565634483543557, "language_loss": 0.69123, "learning_rate": 3.0219557016007723e-06, "loss": 0.71338654, "num_input_tokens_seen": 62402235, "step": 2906, "time_per_iteration": 2.5540504455566406 }, { "auxiliary_loss_clip": 0.01172369, "auxiliary_loss_mlp": 0.01035951, "balance_loss_clip": 1.05627489, "balance_loss_mlp": 1.02775514, "epoch": 0.3495460830878374, "flos": 24426043441920.0, "grad_norm": 1.8971957532594217, "language_loss": 0.69731164, "learning_rate": 3.021286024843202e-06, "loss": 0.71939486, "num_input_tokens_seen": 62420430, "step": 2907, "time_per_iteration": 2.5714049339294434 }, { "auxiliary_loss_clip": 0.01084242, "auxiliary_loss_mlp": 0.01002008, "balance_loss_clip": 1.02243543, "balance_loss_mlp": 1.00050628, "epoch": 0.3496663259784765, "flos": 70008749389440.0, "grad_norm": 1.0730551392409449, "language_loss": 0.64824623, "learning_rate": 3.0206161931573526e-06, "loss": 0.66910875, "num_input_tokens_seen": 62472980, "step": 2908, "time_per_iteration": 3.0165178775787354 }, { "auxiliary_loss_clip": 0.01160201, "auxiliary_loss_mlp": 0.01027775, "balance_loss_clip": 1.05241728, "balance_loss_mlp": 1.01940632, "epoch": 0.34978656886911563, "flos": 28692805322880.0, "grad_norm": 1.603293142953614, "language_loss": 0.92911679, "learning_rate": 3.0199462066448388e-06, "loss": 0.95099652, "num_input_tokens_seen": 62495175, "step": 2909, "time_per_iteration": 2.618485689163208 }, { "auxiliary_loss_clip": 0.01177419, "auxiliary_loss_mlp": 0.01034933, "balance_loss_clip": 1.05501533, "balance_loss_mlp": 1.02638614, "epoch": 0.3499068117597547, "flos": 21142156389120.0, "grad_norm": 1.9057643844446464, "language_loss": 0.69073731, "learning_rate": 3.019276065407296e-06, "loss": 0.71286088, "num_input_tokens_seen": 62514295, "step": 2910, "time_per_iteration": 2.54492449760437 }, { "auxiliary_loss_clip": 0.01135377, "auxiliary_loss_mlp": 0.01041635, "balance_loss_clip": 1.04908967, "balance_loss_mlp": 1.03248608, "epoch": 0.3500270546503938, "flos": 22782699285120.0, "grad_norm": 1.7276674538637886, "language_loss": 0.80226147, "learning_rate": 3.018605769546385e-06, "loss": 0.82403159, "num_input_tokens_seen": 62534850, "step": 2911, "time_per_iteration": 2.640115737915039 }, { "auxiliary_loss_clip": 0.01173031, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.05141878, "balance_loss_mlp": 1.0254575, "epoch": 0.3501472975410329, "flos": 22894058424960.0, "grad_norm": 1.838390232102658, "language_loss": 0.79569614, "learning_rate": 3.017935319163788e-06, "loss": 0.81777442, "num_input_tokens_seen": 62553810, "step": 2912, "time_per_iteration": 2.536867618560791 }, { "auxiliary_loss_clip": 0.01179602, "auxiliary_loss_mlp": 0.01034387, "balance_loss_clip": 1.05653012, "balance_loss_mlp": 1.02474296, "epoch": 0.35026754043167196, "flos": 25446588658560.0, "grad_norm": 1.612060217566858, "language_loss": 0.70679367, "learning_rate": 3.017264714361213e-06, "loss": 0.72893351, "num_input_tokens_seen": 62573460, "step": 2913, "time_per_iteration": 2.5598392486572266 }, { "auxiliary_loss_clip": 0.01165506, "auxiliary_loss_mlp": 0.00762733, "balance_loss_clip": 1.05621672, "balance_loss_mlp": 1.00020981, "epoch": 0.3503877833223111, "flos": 19573757959680.0, "grad_norm": 1.8788484529520266, "language_loss": 0.82255793, "learning_rate": 3.016593955240389e-06, "loss": 0.84184027, "num_input_tokens_seen": 62592150, "step": 2914, "time_per_iteration": 2.574967861175537 }, { "auxiliary_loss_clip": 0.01067809, "auxiliary_loss_mlp": 0.01003359, "balance_loss_clip": 1.01982403, "balance_loss_mlp": 1.00179768, "epoch": 0.3505080262129502, "flos": 65072075880960.0, "grad_norm": 0.8755624465262817, "language_loss": 0.63733232, "learning_rate": 3.015923041903071e-06, "loss": 0.6580441, "num_input_tokens_seen": 62658275, "step": 2915, "time_per_iteration": 3.158841848373413 }, { "auxiliary_loss_clip": 0.01176134, "auxiliary_loss_mlp": 0.01035973, "balance_loss_clip": 1.05754209, "balance_loss_mlp": 1.02739573, "epoch": 0.35062826910358924, "flos": 29314562768640.0, "grad_norm": 2.120033101365607, "language_loss": 0.83511508, "learning_rate": 3.0152519744510347e-06, "loss": 0.85723615, "num_input_tokens_seen": 62678075, "step": 2916, "time_per_iteration": 2.5870041847229004 }, { "auxiliary_loss_clip": 0.01148275, "auxiliary_loss_mlp": 0.01030226, "balance_loss_clip": 1.05151772, "balance_loss_mlp": 1.0223285, "epoch": 0.35074851199422835, "flos": 23987717775360.0, "grad_norm": 1.7671441554612437, "language_loss": 0.82473975, "learning_rate": 3.014580752986081e-06, "loss": 0.84652478, "num_input_tokens_seen": 62696950, "step": 2917, "time_per_iteration": 2.6295666694641113 }, { "auxiliary_loss_clip": 0.01139876, "auxiliary_loss_mlp": 0.0103365, "balance_loss_clip": 1.05331028, "balance_loss_mlp": 1.02477527, "epoch": 0.3508687548848674, "flos": 15224436668160.0, "grad_norm": 1.809485217244025, "language_loss": 0.78622371, "learning_rate": 3.0139093776100345e-06, "loss": 0.80795902, "num_input_tokens_seen": 62713540, "step": 2918, "time_per_iteration": 2.573765754699707 }, { "auxiliary_loss_clip": 0.01187152, "auxiliary_loss_mlp": 0.01029734, "balance_loss_clip": 1.05459833, "balance_loss_mlp": 1.02121079, "epoch": 0.3509889977755065, "flos": 21361750185600.0, "grad_norm": 1.7858777474090206, "language_loss": 0.75717229, "learning_rate": 3.013237848424741e-06, "loss": 0.77934122, "num_input_tokens_seen": 62732925, "step": 2919, "time_per_iteration": 2.4969635009765625 }, { "auxiliary_loss_clip": 0.0116601, "auxiliary_loss_mlp": 0.01032034, "balance_loss_clip": 1.05565119, "balance_loss_mlp": 1.02331424, "epoch": 0.35110924066614563, "flos": 19135360465920.0, "grad_norm": 2.0976379718590157, "language_loss": 0.75419998, "learning_rate": 3.012566165532072e-06, "loss": 0.77618045, "num_input_tokens_seen": 62751715, "step": 2920, "time_per_iteration": 2.5372917652130127 }, { "auxiliary_loss_clip": 0.01121798, "auxiliary_loss_mlp": 0.01037741, "balance_loss_clip": 1.04703665, "balance_loss_mlp": 1.02909279, "epoch": 0.3512294835567847, "flos": 21980885938560.0, "grad_norm": 2.362417253563913, "language_loss": 0.76344681, "learning_rate": 3.0118943290339207e-06, "loss": 0.78504217, "num_input_tokens_seen": 62771925, "step": 2921, "time_per_iteration": 2.6943583488464355 }, { "auxiliary_loss_clip": 0.01137411, "auxiliary_loss_mlp": 0.01032734, "balance_loss_clip": 1.04757547, "balance_loss_mlp": 1.02402592, "epoch": 0.3513497264474238, "flos": 17817294896640.0, "grad_norm": 1.8835208345519276, "language_loss": 0.68213606, "learning_rate": 3.011222339032204e-06, "loss": 0.70383757, "num_input_tokens_seen": 62790075, "step": 2922, "time_per_iteration": 2.566432476043701 }, { "auxiliary_loss_clip": 0.01188784, "auxiliary_loss_mlp": 0.01029696, "balance_loss_clip": 1.05564117, "balance_loss_mlp": 1.02138066, "epoch": 0.3514699693380629, "flos": 26943417239040.0, "grad_norm": 1.7066425958830471, "language_loss": 0.69068933, "learning_rate": 3.0105501956288626e-06, "loss": 0.71287411, "num_input_tokens_seen": 62810545, "step": 2923, "time_per_iteration": 3.3607406616210938 }, { "auxiliary_loss_clip": 0.01180773, "auxiliary_loss_mlp": 0.01036226, "balance_loss_clip": 1.05416286, "balance_loss_mlp": 1.0273087, "epoch": 0.35159021222870196, "flos": 15267565923840.0, "grad_norm": 2.0277476277628126, "language_loss": 0.72733462, "learning_rate": 3.0098778989258602e-06, "loss": 0.74950463, "num_input_tokens_seen": 62829155, "step": 2924, "time_per_iteration": 2.49186372756958 }, { "auxiliary_loss_clip": 0.01144304, "auxiliary_loss_mlp": 0.01035476, "balance_loss_clip": 1.05369949, "balance_loss_mlp": 1.02696455, "epoch": 0.35171045511934107, "flos": 13984154000640.0, "grad_norm": 1.889510064805231, "language_loss": 0.87913203, "learning_rate": 3.009205449025183e-06, "loss": 0.90092981, "num_input_tokens_seen": 62845350, "step": 2925, "time_per_iteration": 2.558403730392456 }, { "auxiliary_loss_clip": 0.01140903, "auxiliary_loss_mlp": 0.01034339, "balance_loss_clip": 1.04794025, "balance_loss_mlp": 1.02525568, "epoch": 0.3518306980099802, "flos": 14283434119680.0, "grad_norm": 1.7915308613918992, "language_loss": 0.63177657, "learning_rate": 3.008532846028842e-06, "loss": 0.65352905, "num_input_tokens_seen": 62862110, "step": 2926, "time_per_iteration": 2.5531232357025146 }, { "auxiliary_loss_clip": 0.01191542, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.0564661, "balance_loss_mlp": 1.02417362, "epoch": 0.35195094090061924, "flos": 27052872958080.0, "grad_norm": 3.409516309934858, "language_loss": 0.71690607, "learning_rate": 3.0078600900388694e-06, "loss": 0.73915988, "num_input_tokens_seen": 62882415, "step": 2927, "time_per_iteration": 3.291128635406494 }, { "auxiliary_loss_clip": 0.01137617, "auxiliary_loss_mlp": 0.01035003, "balance_loss_clip": 1.04903567, "balance_loss_mlp": 1.02615201, "epoch": 0.35207118379125835, "flos": 25629266252160.0, "grad_norm": 2.01244747493883, "language_loss": 0.74473602, "learning_rate": 3.007187181157323e-06, "loss": 0.76646227, "num_input_tokens_seen": 62902425, "step": 2928, "time_per_iteration": 2.6077446937561035 }, { "auxiliary_loss_clip": 0.01110074, "auxiliary_loss_mlp": 0.01034501, "balance_loss_clip": 1.04708838, "balance_loss_mlp": 1.02575696, "epoch": 0.35219142668189746, "flos": 18004713085440.0, "grad_norm": 2.312033122612613, "language_loss": 0.67987168, "learning_rate": 3.006514119486282e-06, "loss": 0.70131743, "num_input_tokens_seen": 62919255, "step": 2929, "time_per_iteration": 3.3855643272399902 }, { "auxiliary_loss_clip": 0.01141566, "auxiliary_loss_mlp": 0.01032619, "balance_loss_clip": 1.0508604, "balance_loss_mlp": 1.02389848, "epoch": 0.3523116695725365, "flos": 14028109269120.0, "grad_norm": 1.7722484288679574, "language_loss": 0.6946249, "learning_rate": 3.005840905127849e-06, "loss": 0.71636677, "num_input_tokens_seen": 62936160, "step": 2930, "time_per_iteration": 2.5466082096099854 }, { "auxiliary_loss_clip": 0.01193016, "auxiliary_loss_mlp": 0.01035216, "balance_loss_clip": 1.05859816, "balance_loss_mlp": 1.02678156, "epoch": 0.3524319124631756, "flos": 21433966479360.0, "grad_norm": 1.9581525035326448, "language_loss": 0.86863154, "learning_rate": 3.0051675381841516e-06, "loss": 0.89091384, "num_input_tokens_seen": 62953470, "step": 2931, "time_per_iteration": 3.301849603652954 }, { "auxiliary_loss_clip": 0.01105773, "auxiliary_loss_mlp": 0.007629, "balance_loss_clip": 1.04721129, "balance_loss_mlp": 1.00023627, "epoch": 0.3525521553538147, "flos": 26322773114880.0, "grad_norm": 2.0466272697636994, "language_loss": 0.77048445, "learning_rate": 3.0044940187573363e-06, "loss": 0.78917122, "num_input_tokens_seen": 62974480, "step": 2932, "time_per_iteration": 2.77781343460083 }, { "auxiliary_loss_clip": 0.01178587, "auxiliary_loss_mlp": 0.01033687, "balance_loss_clip": 1.05519295, "balance_loss_mlp": 1.02531815, "epoch": 0.3526723982444538, "flos": 21543314457600.0, "grad_norm": 2.003518602299335, "language_loss": 0.65293252, "learning_rate": 3.003820346949578e-06, "loss": 0.67505527, "num_input_tokens_seen": 62992560, "step": 2933, "time_per_iteration": 2.5514028072357178 }, { "auxiliary_loss_clip": 0.0118994, "auxiliary_loss_mlp": 0.01029475, "balance_loss_clip": 1.05498719, "balance_loss_mlp": 1.02052259, "epoch": 0.3527926411350929, "flos": 23733649900800.0, "grad_norm": 4.592078972732562, "language_loss": 0.79565358, "learning_rate": 3.003146522863071e-06, "loss": 0.81784773, "num_input_tokens_seen": 63013445, "step": 2934, "time_per_iteration": 2.521057605743408 }, { "auxiliary_loss_clip": 0.01158371, "auxiliary_loss_mlp": 0.01032998, "balance_loss_clip": 1.05329895, "balance_loss_mlp": 1.02453458, "epoch": 0.35291288402573195, "flos": 30445461544320.0, "grad_norm": 2.1492957780130686, "language_loss": 0.86619318, "learning_rate": 3.0024725466000345e-06, "loss": 0.88810682, "num_input_tokens_seen": 63033400, "step": 2935, "time_per_iteration": 2.643237829208374 }, { "auxiliary_loss_clip": 0.01175568, "auxiliary_loss_mlp": 0.01030614, "balance_loss_clip": 1.0558815, "balance_loss_mlp": 1.02198899, "epoch": 0.35303312691637107, "flos": 23112179763840.0, "grad_norm": 1.7831961873947628, "language_loss": 0.78956401, "learning_rate": 3.0017984182627087e-06, "loss": 0.81162584, "num_input_tokens_seen": 63052725, "step": 2936, "time_per_iteration": 2.5603106021881104 }, { "auxiliary_loss_clip": 0.01145684, "auxiliary_loss_mlp": 0.00762016, "balance_loss_clip": 1.05025578, "balance_loss_mlp": 1.00026357, "epoch": 0.3531533698070102, "flos": 21835699165440.0, "grad_norm": 1.9265162608489756, "language_loss": 0.82390761, "learning_rate": 3.00112413795336e-06, "loss": 0.84298462, "num_input_tokens_seen": 63072560, "step": 2937, "time_per_iteration": 2.642991065979004 }, { "auxiliary_loss_clip": 0.01157507, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.04713035, "balance_loss_mlp": 1.0241282, "epoch": 0.35327361269764923, "flos": 15778969810560.0, "grad_norm": 4.343893384890595, "language_loss": 0.79819477, "learning_rate": 3.000449705774275e-06, "loss": 0.8200953, "num_input_tokens_seen": 63090800, "step": 2938, "time_per_iteration": 2.5782315731048584 }, { "auxiliary_loss_clip": 0.0117694, "auxiliary_loss_mlp": 0.01034729, "balance_loss_clip": 1.05578876, "balance_loss_mlp": 1.02574611, "epoch": 0.35339385558828834, "flos": 22090413484800.0, "grad_norm": 2.078845268821874, "language_loss": 0.7118901, "learning_rate": 2.9997751218277654e-06, "loss": 0.73400682, "num_input_tokens_seen": 63108955, "step": 2939, "time_per_iteration": 2.5523881912231445 }, { "auxiliary_loss_clip": 0.01189812, "auxiliary_loss_mlp": 0.01040231, "balance_loss_clip": 1.05659461, "balance_loss_mlp": 1.03117716, "epoch": 0.35351409847892745, "flos": 24165008328960.0, "grad_norm": 2.044665502704415, "language_loss": 0.77633983, "learning_rate": 2.999100386216166e-06, "loss": 0.79864025, "num_input_tokens_seen": 63127895, "step": 2940, "time_per_iteration": 2.5185952186584473 }, { "auxiliary_loss_clip": 0.01160578, "auxiliary_loss_mlp": 0.01037936, "balance_loss_clip": 1.05370665, "balance_loss_mlp": 1.02943099, "epoch": 0.3536343413695665, "flos": 27052298340480.0, "grad_norm": 1.9533908391308672, "language_loss": 0.74176681, "learning_rate": 2.998425499041831e-06, "loss": 0.76375192, "num_input_tokens_seen": 63148410, "step": 2941, "time_per_iteration": 2.6211040019989014 }, { "auxiliary_loss_clip": 0.0106924, "auxiliary_loss_mlp": 0.01000788, "balance_loss_clip": 1.01942158, "balance_loss_mlp": 0.99932176, "epoch": 0.3537545842602056, "flos": 65991066370560.0, "grad_norm": 1.2767370655689447, "language_loss": 0.6450097, "learning_rate": 2.997750460407142e-06, "loss": 0.66571003, "num_input_tokens_seen": 63209765, "step": 2942, "time_per_iteration": 3.1571106910705566 }, { "auxiliary_loss_clip": 0.0115147, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.05029416, "balance_loss_mlp": 1.02273953, "epoch": 0.35387482715084473, "flos": 18436897526400.0, "grad_norm": 2.8357653571367494, "language_loss": 0.7010628, "learning_rate": 2.997075270414501e-06, "loss": 0.72290045, "num_input_tokens_seen": 63226980, "step": 2943, "time_per_iteration": 2.6431119441986084 }, { "auxiliary_loss_clip": 0.0105692, "auxiliary_loss_mlp": 0.01000227, "balance_loss_clip": 1.01843476, "balance_loss_mlp": 0.99872512, "epoch": 0.3539950700414838, "flos": 65588579498880.0, "grad_norm": 0.7002394363677609, "language_loss": 0.57798016, "learning_rate": 2.9963999291663347e-06, "loss": 0.59855163, "num_input_tokens_seen": 63292760, "step": 2944, "time_per_iteration": 3.1758339405059814 }, { "auxiliary_loss_clip": 0.01135786, "auxiliary_loss_mlp": 0.01043116, "balance_loss_clip": 1.05425501, "balance_loss_mlp": 1.03413403, "epoch": 0.3541153129321229, "flos": 20521655919360.0, "grad_norm": 4.088943333125777, "language_loss": 0.73735374, "learning_rate": 2.9957244367650915e-06, "loss": 0.75914276, "num_input_tokens_seen": 63309005, "step": 2945, "time_per_iteration": 2.6610565185546875 }, { "auxiliary_loss_clip": 0.01127398, "auxiliary_loss_mlp": 0.01034888, "balance_loss_clip": 1.0508368, "balance_loss_mlp": 1.02632904, "epoch": 0.354235555822762, "flos": 19573578391680.0, "grad_norm": 1.7881194737641446, "language_loss": 0.83810824, "learning_rate": 2.9950487933132425e-06, "loss": 0.85973114, "num_input_tokens_seen": 63326420, "step": 2946, "time_per_iteration": 2.6404831409454346 }, { "auxiliary_loss_clip": 0.0117957, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.05469894, "balance_loss_mlp": 1.02360606, "epoch": 0.35435579871340106, "flos": 20777268078720.0, "grad_norm": 2.1695012996894176, "language_loss": 0.71374333, "learning_rate": 2.994372998913283e-06, "loss": 0.73586154, "num_input_tokens_seen": 63344925, "step": 2947, "time_per_iteration": 2.5512475967407227 }, { "auxiliary_loss_clip": 0.01165056, "auxiliary_loss_mlp": 0.01031547, "balance_loss_clip": 1.05547619, "balance_loss_mlp": 1.0229466, "epoch": 0.35447604160404017, "flos": 23951807153280.0, "grad_norm": 2.2683438768037174, "language_loss": 0.62379706, "learning_rate": 2.99369705366773e-06, "loss": 0.64576316, "num_input_tokens_seen": 63365170, "step": 2948, "time_per_iteration": 2.6156389713287354 }, { "auxiliary_loss_clip": 0.01159651, "auxiliary_loss_mlp": 0.0103517, "balance_loss_clip": 1.0554136, "balance_loss_mlp": 1.02622294, "epoch": 0.3545962844946792, "flos": 23435662671360.0, "grad_norm": 2.5718085886106468, "language_loss": 0.81652641, "learning_rate": 2.9930209576791244e-06, "loss": 0.83847463, "num_input_tokens_seen": 63383645, "step": 2949, "time_per_iteration": 3.344034433364868 }, { "auxiliary_loss_clip": 0.01172302, "auxiliary_loss_mlp": 0.0102576, "balance_loss_clip": 1.05344033, "balance_loss_mlp": 1.01729667, "epoch": 0.35471652738531834, "flos": 22085134185600.0, "grad_norm": 1.9243288775532592, "language_loss": 0.63841122, "learning_rate": 2.9923447110500285e-06, "loss": 0.66039187, "num_input_tokens_seen": 63402390, "step": 2950, "time_per_iteration": 2.553407907485962 }, { "auxiliary_loss_clip": 0.01165561, "auxiliary_loss_mlp": 0.01033064, "balance_loss_clip": 1.05339217, "balance_loss_mlp": 1.02464139, "epoch": 0.35483677027595745, "flos": 27341881787520.0, "grad_norm": 1.9035049127752064, "language_loss": 0.75243121, "learning_rate": 2.9916683138830295e-06, "loss": 0.77441746, "num_input_tokens_seen": 63423055, "step": 2951, "time_per_iteration": 2.6091485023498535 }, { "auxiliary_loss_clip": 0.01156955, "auxiliary_loss_mlp": 0.0103526, "balance_loss_clip": 1.05264997, "balance_loss_mlp": 1.02661121, "epoch": 0.3549570131665965, "flos": 13516166678400.0, "grad_norm": 2.5647282678311902, "language_loss": 0.80928093, "learning_rate": 2.9909917662807353e-06, "loss": 0.8312031, "num_input_tokens_seen": 63440855, "step": 2952, "time_per_iteration": 2.550844669342041 }, { "auxiliary_loss_clip": 0.01174445, "auxiliary_loss_mlp": 0.0103587, "balance_loss_clip": 1.05368924, "balance_loss_mlp": 1.02660751, "epoch": 0.3550772560572356, "flos": 20887549810560.0, "grad_norm": 2.2616642409585967, "language_loss": 0.6871618, "learning_rate": 2.9903150683457783e-06, "loss": 0.70926493, "num_input_tokens_seen": 63459400, "step": 2953, "time_per_iteration": 3.2935280799865723 }, { "auxiliary_loss_clip": 0.01159794, "auxiliary_loss_mlp": 0.01034287, "balance_loss_clip": 1.05193734, "balance_loss_mlp": 1.02556717, "epoch": 0.3551974989478747, "flos": 20194042947840.0, "grad_norm": 2.3100128594749973, "language_loss": 0.65141284, "learning_rate": 2.9896382201808126e-06, "loss": 0.67335367, "num_input_tokens_seen": 63476800, "step": 2954, "time_per_iteration": 2.5502383708953857 }, { "auxiliary_loss_clip": 0.01189377, "auxiliary_loss_mlp": 0.01035686, "balance_loss_clip": 1.05516315, "balance_loss_mlp": 1.02727556, "epoch": 0.3553177418385138, "flos": 19828831415040.0, "grad_norm": 2.3032517855367116, "language_loss": 0.80947065, "learning_rate": 2.988961221888516e-06, "loss": 0.83172125, "num_input_tokens_seen": 63493475, "step": 2955, "time_per_iteration": 3.2146048545837402 }, { "auxiliary_loss_clip": 0.01137055, "auxiliary_loss_mlp": 0.01032825, "balance_loss_clip": 1.04907727, "balance_loss_mlp": 1.0244565, "epoch": 0.3554379847291529, "flos": 14829132516480.0, "grad_norm": 2.293735216558372, "language_loss": 0.79562759, "learning_rate": 2.988284073571589e-06, "loss": 0.81732637, "num_input_tokens_seen": 63509560, "step": 2956, "time_per_iteration": 2.5867269039154053 }, { "auxiliary_loss_clip": 0.01176702, "auxiliary_loss_mlp": 0.00762334, "balance_loss_clip": 1.05434501, "balance_loss_mlp": 1.00024354, "epoch": 0.355558227619792, "flos": 20485350247680.0, "grad_norm": 2.2558053273478493, "language_loss": 0.7298156, "learning_rate": 2.9876067753327528e-06, "loss": 0.74920595, "num_input_tokens_seen": 63527290, "step": 2957, "time_per_iteration": 3.405550241470337 }, { "auxiliary_loss_clip": 0.01178228, "auxiliary_loss_mlp": 0.01036731, "balance_loss_clip": 1.05359542, "balance_loss_mlp": 1.02793908, "epoch": 0.35567847051043106, "flos": 37663613256960.0, "grad_norm": 1.813804692749209, "language_loss": 0.80463111, "learning_rate": 2.986929327274754e-06, "loss": 0.82678068, "num_input_tokens_seen": 63547870, "step": 2958, "time_per_iteration": 2.671753168106079 }, { "auxiliary_loss_clip": 0.01177592, "auxiliary_loss_mlp": 0.01034769, "balance_loss_clip": 1.05780411, "balance_loss_mlp": 1.02641261, "epoch": 0.35579871340107017, "flos": 26943058103040.0, "grad_norm": 1.6137583906021165, "language_loss": 0.7864399, "learning_rate": 2.9862517295003617e-06, "loss": 0.80856353, "num_input_tokens_seen": 63568285, "step": 2959, "time_per_iteration": 2.622518539428711 }, { "auxiliary_loss_clip": 0.01144889, "auxiliary_loss_mlp": 0.01031084, "balance_loss_clip": 1.04857564, "balance_loss_mlp": 1.02237535, "epoch": 0.3559189562917093, "flos": 28293335193600.0, "grad_norm": 1.5835469987429431, "language_loss": 0.72495151, "learning_rate": 2.9855739821123654e-06, "loss": 0.74671113, "num_input_tokens_seen": 63589865, "step": 2960, "time_per_iteration": 2.676173448562622 }, { "auxiliary_loss_clip": 0.01171813, "auxiliary_loss_mlp": 0.01038462, "balance_loss_clip": 1.05440366, "balance_loss_mlp": 1.03019524, "epoch": 0.35603919918234833, "flos": 25664063552640.0, "grad_norm": 1.7510492202817691, "language_loss": 0.817711, "learning_rate": 2.98489608521358e-06, "loss": 0.83981377, "num_input_tokens_seen": 63609805, "step": 2961, "time_per_iteration": 2.60746431350708 }, { "auxiliary_loss_clip": 0.0117817, "auxiliary_loss_mlp": 0.00761882, "balance_loss_clip": 1.0548445, "balance_loss_mlp": 1.00024629, "epoch": 0.35615944207298744, "flos": 23000856537600.0, "grad_norm": 2.0942067109982823, "language_loss": 0.79591548, "learning_rate": 2.9842180389068425e-06, "loss": 0.81531596, "num_input_tokens_seen": 63627115, "step": 2962, "time_per_iteration": 2.5739121437072754 }, { "auxiliary_loss_clip": 0.01048691, "auxiliary_loss_mlp": 0.01014327, "balance_loss_clip": 1.02378392, "balance_loss_mlp": 1.01273, "epoch": 0.35627968496362655, "flos": 68251283723520.0, "grad_norm": 0.8495407590465676, "language_loss": 0.59286582, "learning_rate": 2.98353984329501e-06, "loss": 0.61349607, "num_input_tokens_seen": 63691460, "step": 2963, "time_per_iteration": 3.207019329071045 }, { "auxiliary_loss_clip": 0.01159567, "auxiliary_loss_mlp": 0.0103218, "balance_loss_clip": 1.05335808, "balance_loss_mlp": 1.02323914, "epoch": 0.3563999278542656, "flos": 22641714403200.0, "grad_norm": 1.7273145152354126, "language_loss": 0.70577037, "learning_rate": 2.982861498480965e-06, "loss": 0.72768784, "num_input_tokens_seen": 63713840, "step": 2964, "time_per_iteration": 2.6194419860839844 }, { "auxiliary_loss_clip": 0.01141984, "auxiliary_loss_mlp": 0.01027007, "balance_loss_clip": 1.04895496, "balance_loss_mlp": 1.01867461, "epoch": 0.3565201707449047, "flos": 25952533678080.0, "grad_norm": 2.2327427914568596, "language_loss": 0.82850099, "learning_rate": 2.9821830045676122e-06, "loss": 0.85019088, "num_input_tokens_seen": 63733540, "step": 2965, "time_per_iteration": 2.645923614501953 }, { "auxiliary_loss_clip": 0.01191524, "auxiliary_loss_mlp": 0.01032342, "balance_loss_clip": 1.05873811, "balance_loss_mlp": 1.02355671, "epoch": 0.3566404136355438, "flos": 28475725478400.0, "grad_norm": 1.784800908701464, "language_loss": 0.73194659, "learning_rate": 2.9815043616578793e-06, "loss": 0.75418532, "num_input_tokens_seen": 63754335, "step": 2966, "time_per_iteration": 2.591174840927124 }, { "auxiliary_loss_clip": 0.01143833, "auxiliary_loss_mlp": 0.01031014, "balance_loss_clip": 1.04928684, "balance_loss_mlp": 1.02238297, "epoch": 0.3567606565261829, "flos": 38363117690880.0, "grad_norm": 2.175309034732082, "language_loss": 0.76878512, "learning_rate": 2.9808255698547145e-06, "loss": 0.79053354, "num_input_tokens_seen": 63777135, "step": 2967, "time_per_iteration": 2.7601420879364014 }, { "auxiliary_loss_clip": 0.01176242, "auxiliary_loss_mlp": 0.01034961, "balance_loss_clip": 1.05749846, "balance_loss_mlp": 1.02640784, "epoch": 0.356880899416822, "flos": 21981029592960.0, "grad_norm": 2.1436998181284075, "language_loss": 0.79607993, "learning_rate": 2.9801466292610913e-06, "loss": 0.81819201, "num_input_tokens_seen": 63797020, "step": 2968, "time_per_iteration": 2.564422845840454 }, { "auxiliary_loss_clip": 0.01174084, "auxiliary_loss_mlp": 0.01029992, "balance_loss_clip": 1.05403316, "balance_loss_mlp": 1.02167714, "epoch": 0.35700114230746105, "flos": 18989132198400.0, "grad_norm": 2.2936440843313104, "language_loss": 0.80855602, "learning_rate": 2.979467539980003e-06, "loss": 0.8305968, "num_input_tokens_seen": 63813810, "step": 2969, "time_per_iteration": 2.5264787673950195 }, { "auxiliary_loss_clip": 0.0117695, "auxiliary_loss_mlp": 0.0103043, "balance_loss_clip": 1.0564481, "balance_loss_mlp": 1.02161467, "epoch": 0.35712138519810016, "flos": 19756112330880.0, "grad_norm": 1.8312404226844985, "language_loss": 0.77181017, "learning_rate": 2.978788302114468e-06, "loss": 0.79388392, "num_input_tokens_seen": 63830925, "step": 2970, "time_per_iteration": 2.568730354309082 }, { "auxiliary_loss_clip": 0.01173521, "auxiliary_loss_mlp": 0.01025746, "balance_loss_clip": 1.05625439, "balance_loss_mlp": 1.01730025, "epoch": 0.35724162808873927, "flos": 35183012008320.0, "grad_norm": 2.0877345239337695, "language_loss": 0.82975709, "learning_rate": 2.9781089157675255e-06, "loss": 0.85174978, "num_input_tokens_seen": 63849385, "step": 2971, "time_per_iteration": 2.6517412662506104 }, { "auxiliary_loss_clip": 0.01168861, "auxiliary_loss_mlp": 0.01032811, "balance_loss_clip": 1.05444157, "balance_loss_mlp": 1.02417445, "epoch": 0.3573618709793783, "flos": 25556726736000.0, "grad_norm": 1.5180517220456982, "language_loss": 0.88257861, "learning_rate": 2.977429381042238e-06, "loss": 0.90459526, "num_input_tokens_seen": 63870060, "step": 2972, "time_per_iteration": 2.604482650756836 }, { "auxiliary_loss_clip": 0.01161949, "auxiliary_loss_mlp": 0.01030096, "balance_loss_clip": 1.05378127, "balance_loss_mlp": 1.02218056, "epoch": 0.35748211387001744, "flos": 29132352051840.0, "grad_norm": 2.0951629530708553, "language_loss": 0.88458514, "learning_rate": 2.9767496980416913e-06, "loss": 0.90650558, "num_input_tokens_seen": 63889355, "step": 2973, "time_per_iteration": 2.612957000732422 }, { "auxiliary_loss_clip": 0.01153241, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 1.04934359, "balance_loss_mlp": 1.02502751, "epoch": 0.35760235676065655, "flos": 13954169122560.0, "grad_norm": 3.262699544296505, "language_loss": 0.81082118, "learning_rate": 2.9760698668689914e-06, "loss": 0.8326928, "num_input_tokens_seen": 63905580, "step": 2974, "time_per_iteration": 2.5223615169525146 }, { "auxiliary_loss_clip": 0.01174418, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.05356789, "balance_loss_mlp": 1.0221802, "epoch": 0.3577225996512956, "flos": 44018688977280.0, "grad_norm": 3.2470123520484417, "language_loss": 0.71305203, "learning_rate": 2.975389887627269e-06, "loss": 0.73510247, "num_input_tokens_seen": 63928180, "step": 2975, "time_per_iteration": 3.444403648376465 }, { "auxiliary_loss_clip": 0.01148396, "auxiliary_loss_mlp": 0.01037493, "balance_loss_clip": 1.05117178, "balance_loss_mlp": 1.02978635, "epoch": 0.3578428425419347, "flos": 17055199013760.0, "grad_norm": 2.4954237047389833, "language_loss": 0.89835173, "learning_rate": 2.9747097604196764e-06, "loss": 0.9202106, "num_input_tokens_seen": 63944825, "step": 2976, "time_per_iteration": 2.5825753211975098 }, { "auxiliary_loss_clip": 0.01036976, "auxiliary_loss_mlp": 0.01006896, "balance_loss_clip": 1.01872742, "balance_loss_mlp": 1.0051434, "epoch": 0.3579630854325738, "flos": 71676550707840.0, "grad_norm": 0.673228561617856, "language_loss": 0.56643474, "learning_rate": 2.9740294853493875e-06, "loss": 0.58687347, "num_input_tokens_seen": 64016385, "step": 2977, "time_per_iteration": 3.401665687561035 }, { "auxiliary_loss_clip": 0.01137438, "auxiliary_loss_mlp": 0.01032733, "balance_loss_clip": 1.05121922, "balance_loss_mlp": 1.02446568, "epoch": 0.3580833283232129, "flos": 25046651652480.0, "grad_norm": 1.859368263692074, "language_loss": 0.6712957, "learning_rate": 2.9733490625196008e-06, "loss": 0.6929974, "num_input_tokens_seen": 64036245, "step": 2978, "time_per_iteration": 2.7114243507385254 }, { "auxiliary_loss_clip": 0.01132467, "auxiliary_loss_mlp": 0.01029218, "balance_loss_clip": 1.04806888, "balance_loss_mlp": 1.02137947, "epoch": 0.358203571213852, "flos": 13953127628160.0, "grad_norm": 2.688448538533833, "language_loss": 0.7522375, "learning_rate": 2.9726684920335353e-06, "loss": 0.77385437, "num_input_tokens_seen": 64054110, "step": 2979, "time_per_iteration": 3.4021072387695312 }, { "auxiliary_loss_clip": 0.0119276, "auxiliary_loss_mlp": 0.00762278, "balance_loss_clip": 1.05610502, "balance_loss_mlp": 1.0002811, "epoch": 0.35832381410449105, "flos": 20302457172480.0, "grad_norm": 2.0402853269433128, "language_loss": 0.81934845, "learning_rate": 2.971987773994432e-06, "loss": 0.83889884, "num_input_tokens_seen": 64070295, "step": 2980, "time_per_iteration": 2.528413772583008 }, { "auxiliary_loss_clip": 0.01166342, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.05132878, "balance_loss_mlp": 1.02924132, "epoch": 0.35844405699513016, "flos": 16983234115200.0, "grad_norm": 1.7976744140797722, "language_loss": 0.83427614, "learning_rate": 2.9713069085055566e-06, "loss": 0.85632086, "num_input_tokens_seen": 64088605, "step": 2981, "time_per_iteration": 3.322321653366089 }, { "auxiliary_loss_clip": 0.01148129, "auxiliary_loss_mlp": 0.01025828, "balance_loss_clip": 1.0541656, "balance_loss_mlp": 1.01801395, "epoch": 0.35856429988576927, "flos": 23216858974080.0, "grad_norm": 1.526973959550334, "language_loss": 0.78915197, "learning_rate": 2.9706258956701958e-06, "loss": 0.81089151, "num_input_tokens_seen": 64108595, "step": 2982, "time_per_iteration": 3.4460666179656982 }, { "auxiliary_loss_clip": 0.01177297, "auxiliary_loss_mlp": 0.0102953, "balance_loss_clip": 1.05557704, "balance_loss_mlp": 1.02101278, "epoch": 0.3586845427764083, "flos": 23034576430080.0, "grad_norm": 2.8122019572984067, "language_loss": 0.77490187, "learning_rate": 2.9699447355916575e-06, "loss": 0.79697007, "num_input_tokens_seen": 64127405, "step": 2983, "time_per_iteration": 2.543665885925293 }, { "auxiliary_loss_clip": 0.01187042, "auxiliary_loss_mlp": 0.00761806, "balance_loss_clip": 1.05544615, "balance_loss_mlp": 1.00024033, "epoch": 0.35880478566704743, "flos": 20010682995840.0, "grad_norm": 2.0605366396313625, "language_loss": 0.74082214, "learning_rate": 2.969263428373275e-06, "loss": 0.76031059, "num_input_tokens_seen": 64145755, "step": 2984, "time_per_iteration": 2.5341405868530273 }, { "auxiliary_loss_clip": 0.01162012, "auxiliary_loss_mlp": 0.01029131, "balance_loss_clip": 1.05232584, "balance_loss_mlp": 1.02140582, "epoch": 0.35892502855768654, "flos": 13699095667200.0, "grad_norm": 2.107418851265342, "language_loss": 0.79423696, "learning_rate": 2.9685819741184007e-06, "loss": 0.8161484, "num_input_tokens_seen": 64164195, "step": 2985, "time_per_iteration": 2.5411946773529053 }, { "auxiliary_loss_clip": 0.01139104, "auxiliary_loss_mlp": 0.01028242, "balance_loss_clip": 1.05003095, "balance_loss_mlp": 1.02052939, "epoch": 0.3590452714483256, "flos": 18114096977280.0, "grad_norm": 2.357496905791042, "language_loss": 0.68380487, "learning_rate": 2.967900372930411e-06, "loss": 0.70547831, "num_input_tokens_seen": 64182705, "step": 2986, "time_per_iteration": 2.6030380725860596 }, { "auxiliary_loss_clip": 0.01153811, "auxiliary_loss_mlp": 0.01038882, "balance_loss_clip": 1.05075741, "balance_loss_mlp": 1.03059125, "epoch": 0.3591655143389647, "flos": 17749352321280.0, "grad_norm": 2.053023237586477, "language_loss": 0.78678519, "learning_rate": 2.9672186249127046e-06, "loss": 0.80871212, "num_input_tokens_seen": 64202170, "step": 2987, "time_per_iteration": 2.5542361736297607 }, { "auxiliary_loss_clip": 0.01160766, "auxiliary_loss_mlp": 0.01025244, "balance_loss_clip": 1.05322397, "balance_loss_mlp": 1.01753736, "epoch": 0.3592857572296038, "flos": 25224409082880.0, "grad_norm": 1.9234653587917128, "language_loss": 0.78805345, "learning_rate": 2.9665367301687014e-06, "loss": 0.80991352, "num_input_tokens_seen": 64220415, "step": 2988, "time_per_iteration": 2.622885227203369 }, { "auxiliary_loss_clip": 0.01148338, "auxiliary_loss_mlp": 0.01031595, "balance_loss_clip": 1.04827297, "balance_loss_mlp": 1.02348328, "epoch": 0.3594060001202429, "flos": 29384408764800.0, "grad_norm": 1.8680717853947078, "language_loss": 0.76733184, "learning_rate": 2.965854688801845e-06, "loss": 0.78913116, "num_input_tokens_seen": 64242475, "step": 2989, "time_per_iteration": 2.6460471153259277 }, { "auxiliary_loss_clip": 0.01169141, "auxiliary_loss_mlp": 0.0102596, "balance_loss_clip": 1.04963684, "balance_loss_mlp": 1.01817584, "epoch": 0.359526243010882, "flos": 17052900543360.0, "grad_norm": 2.024081973352873, "language_loss": 0.76415765, "learning_rate": 2.9651725009156005e-06, "loss": 0.78610861, "num_input_tokens_seen": 64260220, "step": 2990, "time_per_iteration": 2.5363292694091797 }, { "auxiliary_loss_clip": 0.01148502, "auxiliary_loss_mlp": 0.01036132, "balance_loss_clip": 1.04971313, "balance_loss_mlp": 1.02861595, "epoch": 0.3596464859015211, "flos": 22965089569920.0, "grad_norm": 1.6188057072754765, "language_loss": 0.74199027, "learning_rate": 2.964490166613454e-06, "loss": 0.76383662, "num_input_tokens_seen": 64280145, "step": 2991, "time_per_iteration": 2.581660509109497 }, { "auxiliary_loss_clip": 0.01083336, "auxiliary_loss_mlp": 0.01003505, "balance_loss_clip": 1.02283013, "balance_loss_mlp": 1.00199747, "epoch": 0.35976672879216015, "flos": 54739462590720.0, "grad_norm": 0.7593158470077602, "language_loss": 0.57708901, "learning_rate": 2.963807685998917e-06, "loss": 0.59795743, "num_input_tokens_seen": 64336010, "step": 2992, "time_per_iteration": 2.9290308952331543 }, { "auxiliary_loss_clip": 0.01134086, "auxiliary_loss_mlp": 0.01027848, "balance_loss_clip": 1.04892874, "balance_loss_mlp": 1.02049291, "epoch": 0.35988697168279926, "flos": 43139020901760.0, "grad_norm": 1.5105335910541742, "language_loss": 0.7806412, "learning_rate": 2.9631250591755196e-06, "loss": 0.80226058, "num_input_tokens_seen": 64358725, "step": 2993, "time_per_iteration": 2.8622219562530518 }, { "auxiliary_loss_clip": 0.01153637, "auxiliary_loss_mlp": 0.0103182, "balance_loss_clip": 1.05298352, "balance_loss_mlp": 1.02390993, "epoch": 0.36000721457343837, "flos": 35845600239360.0, "grad_norm": 1.834005016182697, "language_loss": 0.57290375, "learning_rate": 2.962442286246817e-06, "loss": 0.59475827, "num_input_tokens_seen": 64381555, "step": 2994, "time_per_iteration": 2.7167856693267822 }, { "auxiliary_loss_clip": 0.01161339, "auxiliary_loss_mlp": 0.01027333, "balance_loss_clip": 1.05301905, "balance_loss_mlp": 1.01891065, "epoch": 0.3601274574640774, "flos": 18291100222080.0, "grad_norm": 1.5615388103299124, "language_loss": 0.69543469, "learning_rate": 2.9617593673163853e-06, "loss": 0.7173214, "num_input_tokens_seen": 64400375, "step": 2995, "time_per_iteration": 2.611388683319092 }, { "auxiliary_loss_clip": 0.01161376, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.050843, "balance_loss_mlp": 1.02036738, "epoch": 0.36024770035471654, "flos": 13333955961600.0, "grad_norm": 2.1806970324604, "language_loss": 0.77278101, "learning_rate": 2.9610763024878216e-06, "loss": 0.79467416, "num_input_tokens_seen": 64415880, "step": 2996, "time_per_iteration": 2.559418201446533 }, { "auxiliary_loss_clip": 0.01151828, "auxiliary_loss_mlp": 0.01028868, "balance_loss_clip": 1.04971063, "balance_loss_mlp": 1.0203979, "epoch": 0.3603679432453556, "flos": 20267013427200.0, "grad_norm": 1.7239202851211788, "language_loss": 0.91831505, "learning_rate": 2.960393091864747e-06, "loss": 0.94012201, "num_input_tokens_seen": 64434260, "step": 2997, "time_per_iteration": 2.577052593231201 }, { "auxiliary_loss_clip": 0.01159764, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.0534693, "balance_loss_mlp": 1.02330065, "epoch": 0.3604881861359947, "flos": 22451135817600.0, "grad_norm": 1.8620274822251992, "language_loss": 0.74873924, "learning_rate": 2.959709735550804e-06, "loss": 0.77065009, "num_input_tokens_seen": 64453855, "step": 2998, "time_per_iteration": 2.622349977493286 }, { "auxiliary_loss_clip": 0.0113265, "auxiliary_loss_mlp": 0.01030815, "balance_loss_clip": 1.04690671, "balance_loss_mlp": 1.02265549, "epoch": 0.3606084290266338, "flos": 22054251467520.0, "grad_norm": 2.1727884090421865, "language_loss": 0.75719935, "learning_rate": 2.9590262336496575e-06, "loss": 0.77883404, "num_input_tokens_seen": 64473585, "step": 2999, "time_per_iteration": 2.6729953289031982 }, { "auxiliary_loss_clip": 0.01141071, "auxiliary_loss_mlp": 0.01035089, "balance_loss_clip": 1.05153251, "balance_loss_mlp": 1.02623188, "epoch": 0.36072867191727287, "flos": 15632921111040.0, "grad_norm": 1.9370419412350122, "language_loss": 0.85378653, "learning_rate": 2.9583425862649936e-06, "loss": 0.87554812, "num_input_tokens_seen": 64491720, "step": 3000, "time_per_iteration": 2.6145782470703125 }, { "auxiliary_loss_clip": 0.01189452, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.05695057, "balance_loss_mlp": 1.02455389, "epoch": 0.360848914807912, "flos": 19677000625920.0, "grad_norm": 2.394161443086213, "language_loss": 0.7371887, "learning_rate": 2.9576587935005215e-06, "loss": 0.75941342, "num_input_tokens_seen": 64509800, "step": 3001, "time_per_iteration": 3.3059401512145996 }, { "auxiliary_loss_clip": 0.0117561, "auxiliary_loss_mlp": 0.01029698, "balance_loss_clip": 1.05473554, "balance_loss_mlp": 1.02119195, "epoch": 0.3609691576985511, "flos": 18877808972160.0, "grad_norm": 2.3422659213003714, "language_loss": 0.72254425, "learning_rate": 2.9569748554599713e-06, "loss": 0.74459732, "num_input_tokens_seen": 64525410, "step": 3002, "time_per_iteration": 2.5386037826538086 }, { "auxiliary_loss_clip": 0.01160657, "auxiliary_loss_mlp": 0.01029221, "balance_loss_clip": 1.0539484, "balance_loss_mlp": 1.02106071, "epoch": 0.36108940058919015, "flos": 42224088648960.0, "grad_norm": 1.963416834705708, "language_loss": 0.73280585, "learning_rate": 2.956290772247097e-06, "loss": 0.75470459, "num_input_tokens_seen": 64544085, "step": 3003, "time_per_iteration": 2.777763605117798 }, { "auxiliary_loss_clip": 0.01124196, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.0501498, "balance_loss_mlp": 1.02540123, "epoch": 0.36120964347982926, "flos": 23185150243200.0, "grad_norm": 2.012552454645506, "language_loss": 0.73042154, "learning_rate": 2.9556065439656724e-06, "loss": 0.75199604, "num_input_tokens_seen": 64563135, "step": 3004, "time_per_iteration": 2.6723005771636963 }, { "auxiliary_loss_clip": 0.01105358, "auxiliary_loss_mlp": 0.01031262, "balance_loss_clip": 1.04274321, "balance_loss_mlp": 1.02392435, "epoch": 0.36132988637046837, "flos": 18113055482880.0, "grad_norm": 1.6636088509979885, "language_loss": 0.81939244, "learning_rate": 2.9549221707194952e-06, "loss": 0.84075868, "num_input_tokens_seen": 64581985, "step": 3005, "time_per_iteration": 3.427976369857788 }, { "auxiliary_loss_clip": 0.01174077, "auxiliary_loss_mlp": 0.01026053, "balance_loss_clip": 1.05388761, "balance_loss_mlp": 1.01808429, "epoch": 0.3614501292611074, "flos": 27813101333760.0, "grad_norm": 6.230958789280565, "language_loss": 0.72599262, "learning_rate": 2.954237652612384e-06, "loss": 0.74799389, "num_input_tokens_seen": 64601035, "step": 3006, "time_per_iteration": 2.624509811401367 }, { "auxiliary_loss_clip": 0.01155901, "auxiliary_loss_mlp": 0.01031463, "balance_loss_clip": 1.0528264, "balance_loss_mlp": 1.02391732, "epoch": 0.36157037215174653, "flos": 22634926732800.0, "grad_norm": 2.1223855815370523, "language_loss": 0.84521031, "learning_rate": 2.9535529897481796e-06, "loss": 0.86708397, "num_input_tokens_seen": 64618580, "step": 3007, "time_per_iteration": 3.272669553756714 }, { "auxiliary_loss_clip": 0.01187191, "auxiliary_loss_mlp": 0.01033454, "balance_loss_clip": 1.05449343, "balance_loss_mlp": 1.02481771, "epoch": 0.36169061504238564, "flos": 12600839376000.0, "grad_norm": 2.343332141191926, "language_loss": 0.76704162, "learning_rate": 2.9528681822307446e-06, "loss": 0.78924811, "num_input_tokens_seen": 64635430, "step": 3008, "time_per_iteration": 3.6193058490753174 }, { "auxiliary_loss_clip": 0.01173071, "auxiliary_loss_mlp": 0.00760876, "balance_loss_clip": 1.05787468, "balance_loss_mlp": 1.0002985, "epoch": 0.3618108579330247, "flos": 26684644682880.0, "grad_norm": 1.9610629955517311, "language_loss": 0.82151628, "learning_rate": 2.952183230163964e-06, "loss": 0.84085578, "num_input_tokens_seen": 64655005, "step": 3009, "time_per_iteration": 2.6674630641937256 }, { "auxiliary_loss_clip": 0.0113949, "auxiliary_loss_mlp": 0.01027984, "balance_loss_clip": 1.04947722, "balance_loss_mlp": 1.02021158, "epoch": 0.3619311008236638, "flos": 22817029708800.0, "grad_norm": 2.1159372241094805, "language_loss": 0.73341405, "learning_rate": 2.9514981336517448e-06, "loss": 0.75508881, "num_input_tokens_seen": 64674775, "step": 3010, "time_per_iteration": 2.6961112022399902 }, { "auxiliary_loss_clip": 0.0117384, "auxiliary_loss_mlp": 0.01029711, "balance_loss_clip": 1.05710196, "balance_loss_mlp": 1.02140844, "epoch": 0.36205134371430286, "flos": 25919603884800.0, "grad_norm": 5.047529039866728, "language_loss": 0.81380534, "learning_rate": 2.950812892798015e-06, "loss": 0.83584082, "num_input_tokens_seen": 64695670, "step": 3011, "time_per_iteration": 2.631805896759033 }, { "auxiliary_loss_clip": 0.01127869, "auxiliary_loss_mlp": 0.00761007, "balance_loss_clip": 1.05235457, "balance_loss_mlp": 1.00029182, "epoch": 0.362171586604942, "flos": 26139592730880.0, "grad_norm": 1.9554608628898114, "language_loss": 0.87154818, "learning_rate": 2.9501275077067256e-06, "loss": 0.89043695, "num_input_tokens_seen": 64716290, "step": 3012, "time_per_iteration": 2.76419997215271 }, { "auxiliary_loss_clip": 0.01097907, "auxiliary_loss_mlp": 0.01033944, "balance_loss_clip": 1.043275, "balance_loss_mlp": 1.02600455, "epoch": 0.3622918294955811, "flos": 28074208273920.0, "grad_norm": 1.4234003060072937, "language_loss": 0.88336098, "learning_rate": 2.949441978481848e-06, "loss": 0.90467954, "num_input_tokens_seen": 64737190, "step": 3013, "time_per_iteration": 2.8157968521118164 }, { "auxiliary_loss_clip": 0.0114741, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 1.0501188, "balance_loss_mlp": 1.02022409, "epoch": 0.36241207238622014, "flos": 19828005402240.0, "grad_norm": 1.845317238759066, "language_loss": 0.79953539, "learning_rate": 2.9487563052273778e-06, "loss": 0.82129514, "num_input_tokens_seen": 64753950, "step": 3014, "time_per_iteration": 2.590996026992798 }, { "auxiliary_loss_clip": 0.0116998, "auxiliary_loss_mlp": 0.01028507, "balance_loss_clip": 1.05644679, "balance_loss_mlp": 1.02057362, "epoch": 0.36253231527685925, "flos": 21397158017280.0, "grad_norm": 1.774398609047121, "language_loss": 0.85661811, "learning_rate": 2.94807048804733e-06, "loss": 0.87860298, "num_input_tokens_seen": 64773570, "step": 3015, "time_per_iteration": 2.557276964187622 }, { "auxiliary_loss_clip": 0.01145704, "auxiliary_loss_mlp": 0.01029854, "balance_loss_clip": 1.04830551, "balance_loss_mlp": 1.02128267, "epoch": 0.36265255816749836, "flos": 18362885552640.0, "grad_norm": 1.7973603499401292, "language_loss": 0.90366513, "learning_rate": 2.9473845270457434e-06, "loss": 0.9254207, "num_input_tokens_seen": 64790385, "step": 3016, "time_per_iteration": 2.6244359016418457 }, { "auxiliary_loss_clip": 0.01152559, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.05170941, "balance_loss_mlp": 1.02199697, "epoch": 0.3627728010581374, "flos": 18660046769280.0, "grad_norm": 2.2668606074741415, "language_loss": 0.69512284, "learning_rate": 2.946698422326677e-06, "loss": 0.71695197, "num_input_tokens_seen": 64807845, "step": 3017, "time_per_iteration": 2.580044746398926 }, { "auxiliary_loss_clip": 0.01127419, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.04691052, "balance_loss_mlp": 1.03058541, "epoch": 0.36289304394877653, "flos": 27524272072320.0, "grad_norm": 2.0801829335386484, "language_loss": 0.79654753, "learning_rate": 2.946012173994213e-06, "loss": 0.81820536, "num_input_tokens_seen": 64827630, "step": 3018, "time_per_iteration": 2.6965670585632324 }, { "auxiliary_loss_clip": 0.01169839, "auxiliary_loss_mlp": 0.01035495, "balance_loss_clip": 1.05649805, "balance_loss_mlp": 1.0275315, "epoch": 0.36301328683941564, "flos": 34533244932480.0, "grad_norm": 1.4205675411924588, "language_loss": 0.67735088, "learning_rate": 2.945325782152454e-06, "loss": 0.69940412, "num_input_tokens_seen": 64850665, "step": 3019, "time_per_iteration": 2.6987364292144775 }, { "auxiliary_loss_clip": 0.01156298, "auxiliary_loss_mlp": 0.01027704, "balance_loss_clip": 1.04832327, "balance_loss_mlp": 1.01970553, "epoch": 0.3631335297300547, "flos": 19025976574080.0, "grad_norm": 2.357773480886045, "language_loss": 0.78539896, "learning_rate": 2.9446392469055257e-06, "loss": 0.80723894, "num_input_tokens_seen": 64868700, "step": 3020, "time_per_iteration": 2.5559279918670654 }, { "auxiliary_loss_clip": 0.0113941, "auxiliary_loss_mlp": 0.01027221, "balance_loss_clip": 1.05426657, "balance_loss_mlp": 1.0189476, "epoch": 0.3632537726206938, "flos": 19536769929600.0, "grad_norm": 1.8399831065155903, "language_loss": 0.7978043, "learning_rate": 2.9439525683575745e-06, "loss": 0.81947064, "num_input_tokens_seen": 64887620, "step": 3021, "time_per_iteration": 2.586909294128418 }, { "auxiliary_loss_clip": 0.01193558, "auxiliary_loss_mlp": 0.01032152, "balance_loss_clip": 1.05860984, "balance_loss_mlp": 1.02362847, "epoch": 0.3633740155113329, "flos": 21068611292160.0, "grad_norm": 2.0230933532387465, "language_loss": 0.75097662, "learning_rate": 2.9432657466127694e-06, "loss": 0.77323371, "num_input_tokens_seen": 64907190, "step": 3022, "time_per_iteration": 2.5171566009521484 }, { "auxiliary_loss_clip": 0.01133388, "auxiliary_loss_mlp": 0.01029043, "balance_loss_clip": 1.05494785, "balance_loss_mlp": 1.0205673, "epoch": 0.36349425840197197, "flos": 20298722158080.0, "grad_norm": 1.685789417394078, "language_loss": 0.76334143, "learning_rate": 2.9425787817753007e-06, "loss": 0.78496575, "num_input_tokens_seen": 64925850, "step": 3023, "time_per_iteration": 2.6426382064819336 }, { "auxiliary_loss_clip": 0.01146917, "auxiliary_loss_mlp": 0.01029321, "balance_loss_clip": 1.05270076, "balance_loss_mlp": 1.0210954, "epoch": 0.3636145012926111, "flos": 29716762331520.0, "grad_norm": 1.4920897406731028, "language_loss": 0.7159313, "learning_rate": 2.94189167394938e-06, "loss": 0.73769379, "num_input_tokens_seen": 64948285, "step": 3024, "time_per_iteration": 2.6681315898895264 }, { "auxiliary_loss_clip": 0.0119118, "auxiliary_loss_mlp": 0.01035404, "balance_loss_clip": 1.05970085, "balance_loss_mlp": 1.02731538, "epoch": 0.3637347441832502, "flos": 21431847576960.0, "grad_norm": 2.1030748232493672, "language_loss": 0.81061196, "learning_rate": 2.941204423239241e-06, "loss": 0.83287781, "num_input_tokens_seen": 64967160, "step": 3025, "time_per_iteration": 2.5022130012512207 }, { "auxiliary_loss_clip": 0.01170257, "auxiliary_loss_mlp": 0.01030627, "balance_loss_clip": 1.05501378, "balance_loss_mlp": 1.02262819, "epoch": 0.36385498707388925, "flos": 29533941083520.0, "grad_norm": 1.7712426920408302, "language_loss": 0.75934488, "learning_rate": 2.9405170297491395e-06, "loss": 0.78135371, "num_input_tokens_seen": 64987155, "step": 3026, "time_per_iteration": 2.607408285140991 }, { "auxiliary_loss_clip": 0.01110813, "auxiliary_loss_mlp": 0.0076235, "balance_loss_clip": 1.05337465, "balance_loss_mlp": 1.00025439, "epoch": 0.36397522996452836, "flos": 22236569925120.0, "grad_norm": 1.9337125131528317, "language_loss": 0.80228269, "learning_rate": 2.939829493583353e-06, "loss": 0.82101429, "num_input_tokens_seen": 65003800, "step": 3027, "time_per_iteration": 3.3907041549682617 }, { "auxiliary_loss_clip": 0.01137575, "auxiliary_loss_mlp": 0.01029263, "balance_loss_clip": 1.04636896, "balance_loss_mlp": 1.0215888, "epoch": 0.3640954728551674, "flos": 21506505995520.0, "grad_norm": 2.6109996943259115, "language_loss": 0.83565295, "learning_rate": 2.939141814846179e-06, "loss": 0.85732126, "num_input_tokens_seen": 65021215, "step": 3028, "time_per_iteration": 2.612678289413452 }, { "auxiliary_loss_clip": 0.01159242, "auxiliary_loss_mlp": 0.01028346, "balance_loss_clip": 1.05152249, "balance_loss_mlp": 1.02017987, "epoch": 0.3642157157458065, "flos": 17712867081600.0, "grad_norm": 1.9006511532324148, "language_loss": 0.82323146, "learning_rate": 2.938453993641938e-06, "loss": 0.84510732, "num_input_tokens_seen": 65039590, "step": 3029, "time_per_iteration": 2.5615835189819336 }, { "auxiliary_loss_clip": 0.01161073, "auxiliary_loss_mlp": 0.01037701, "balance_loss_clip": 1.05670655, "balance_loss_mlp": 1.0288918, "epoch": 0.36433595863644563, "flos": 17639537466240.0, "grad_norm": 2.078396979175287, "language_loss": 0.70102203, "learning_rate": 2.937766030074973e-06, "loss": 0.72300977, "num_input_tokens_seen": 65056845, "step": 3030, "time_per_iteration": 2.5462467670440674 }, { "auxiliary_loss_clip": 0.01147677, "auxiliary_loss_mlp": 0.01035292, "balance_loss_clip": 1.05021048, "balance_loss_mlp": 1.02633977, "epoch": 0.3644562015270847, "flos": 26833279161600.0, "grad_norm": 1.76962567156387, "language_loss": 0.8271904, "learning_rate": 2.937077924249646e-06, "loss": 0.84902012, "num_input_tokens_seen": 65079435, "step": 3031, "time_per_iteration": 3.5059802532196045 }, { "auxiliary_loss_clip": 0.01165657, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.05444336, "balance_loss_mlp": 1.0263257, "epoch": 0.3645764444177238, "flos": 14282715847680.0, "grad_norm": 1.9728971158033006, "language_loss": 0.75358099, "learning_rate": 2.9363896762703443e-06, "loss": 0.77558398, "num_input_tokens_seen": 65096500, "step": 3032, "time_per_iteration": 2.570932626724243 }, { "auxiliary_loss_clip": 0.01188617, "auxiliary_loss_mlp": 0.01031237, "balance_loss_clip": 1.05628633, "balance_loss_mlp": 1.02251053, "epoch": 0.3646966873083629, "flos": 20667489137280.0, "grad_norm": 1.7226941857922682, "language_loss": 0.84365773, "learning_rate": 2.9357012862414725e-06, "loss": 0.86585635, "num_input_tokens_seen": 65115860, "step": 3033, "time_per_iteration": 3.2878799438476562 }, { "auxiliary_loss_clip": 0.01173918, "auxiliary_loss_mlp": 0.01027106, "balance_loss_clip": 1.05484903, "balance_loss_mlp": 1.01920271, "epoch": 0.36481693019900197, "flos": 27782613665280.0, "grad_norm": 2.2298614783992896, "language_loss": 0.72076815, "learning_rate": 2.9350127542674593e-06, "loss": 0.74277836, "num_input_tokens_seen": 65138070, "step": 3034, "time_per_iteration": 3.477714776992798 }, { "auxiliary_loss_clip": 0.0116331, "auxiliary_loss_mlp": 0.0103039, "balance_loss_clip": 1.05413556, "balance_loss_mlp": 1.02211702, "epoch": 0.3649371730896411, "flos": 19712588025600.0, "grad_norm": 1.7836179630980373, "language_loss": 0.76434505, "learning_rate": 2.934324080452755e-06, "loss": 0.78628212, "num_input_tokens_seen": 65155860, "step": 3035, "time_per_iteration": 2.587925910949707 }, { "auxiliary_loss_clip": 0.01137674, "auxiliary_loss_mlp": 0.00763068, "balance_loss_clip": 1.04944229, "balance_loss_mlp": 1.00021601, "epoch": 0.3650574159802802, "flos": 24750496016640.0, "grad_norm": 1.681275716849011, "language_loss": 0.78212619, "learning_rate": 2.9336352649018307e-06, "loss": 0.80113357, "num_input_tokens_seen": 65175930, "step": 3036, "time_per_iteration": 2.660174608230591 }, { "auxiliary_loss_clip": 0.01160802, "auxiliary_loss_mlp": 0.0103413, "balance_loss_clip": 1.05348969, "balance_loss_mlp": 1.02589321, "epoch": 0.36517765887091924, "flos": 32853487363200.0, "grad_norm": 1.5809240629600054, "language_loss": 0.69975024, "learning_rate": 2.9329463077191783e-06, "loss": 0.7216996, "num_input_tokens_seen": 65199305, "step": 3037, "time_per_iteration": 2.6926467418670654 }, { "auxiliary_loss_clip": 0.01133627, "auxiliary_loss_mlp": 0.01027345, "balance_loss_clip": 1.05170417, "balance_loss_mlp": 1.01873243, "epoch": 0.36529790176155835, "flos": 20120318282880.0, "grad_norm": 2.325845761087175, "language_loss": 0.64126492, "learning_rate": 2.9322572090093135e-06, "loss": 0.6628747, "num_input_tokens_seen": 65218010, "step": 3038, "time_per_iteration": 2.63631010055542 }, { "auxiliary_loss_clip": 0.01130314, "auxiliary_loss_mlp": 0.01027033, "balance_loss_clip": 1.04831934, "balance_loss_mlp": 1.01876545, "epoch": 0.36541814465219746, "flos": 17639573379840.0, "grad_norm": 3.005902018464999, "language_loss": 0.76386923, "learning_rate": 2.9315679688767713e-06, "loss": 0.78544271, "num_input_tokens_seen": 65236020, "step": 3039, "time_per_iteration": 2.6230359077453613 }, { "auxiliary_loss_clip": 0.01153348, "auxiliary_loss_mlp": 0.01027578, "balance_loss_clip": 1.05054092, "balance_loss_mlp": 1.01977623, "epoch": 0.3655383875428365, "flos": 22674356887680.0, "grad_norm": 1.9792315846174076, "language_loss": 0.6674186, "learning_rate": 2.9308785874261085e-06, "loss": 0.68922788, "num_input_tokens_seen": 65256210, "step": 3040, "time_per_iteration": 2.60276198387146 }, { "auxiliary_loss_clip": 0.01190613, "auxiliary_loss_mlp": 0.01034016, "balance_loss_clip": 1.05886972, "balance_loss_mlp": 1.02576697, "epoch": 0.36565863043347563, "flos": 21981173247360.0, "grad_norm": 1.739429228584804, "language_loss": 0.81576943, "learning_rate": 2.9301890647619045e-06, "loss": 0.83801568, "num_input_tokens_seen": 65275505, "step": 3041, "time_per_iteration": 2.564736843109131 }, { "auxiliary_loss_clip": 0.01166692, "auxiliary_loss_mlp": 0.01032765, "balance_loss_clip": 1.05669737, "balance_loss_mlp": 1.02407515, "epoch": 0.36577887332411474, "flos": 24827632473600.0, "grad_norm": 1.8394353699799733, "language_loss": 0.80657059, "learning_rate": 2.929499400988759e-06, "loss": 0.82856512, "num_input_tokens_seen": 65296665, "step": 3042, "time_per_iteration": 2.6268043518066406 }, { "auxiliary_loss_clip": 0.01177283, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.05973852, "balance_loss_mlp": 1.02149928, "epoch": 0.3658991162147538, "flos": 28293191539200.0, "grad_norm": 1.75933512841636, "language_loss": 0.65134609, "learning_rate": 2.9288095962112927e-06, "loss": 0.67342365, "num_input_tokens_seen": 65317370, "step": 3043, "time_per_iteration": 2.6216440200805664 }, { "auxiliary_loss_clip": 0.01188327, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.0578146, "balance_loss_mlp": 1.02167225, "epoch": 0.3660193591053929, "flos": 17785550252160.0, "grad_norm": 1.8625630710626468, "language_loss": 0.85039634, "learning_rate": 2.9281196505341503e-06, "loss": 0.87258565, "num_input_tokens_seen": 65334540, "step": 3044, "time_per_iteration": 2.4993529319763184 }, { "auxiliary_loss_clip": 0.0112771, "auxiliary_loss_mlp": 0.0076214, "balance_loss_clip": 1.0508734, "balance_loss_mlp": 1.00022769, "epoch": 0.36613960199603196, "flos": 10342776839040.0, "grad_norm": 1.9013913213161548, "language_loss": 0.78615409, "learning_rate": 2.9274295640619946e-06, "loss": 0.80505258, "num_input_tokens_seen": 65351670, "step": 3045, "time_per_iteration": 2.618086099624634 }, { "auxiliary_loss_clip": 0.01143418, "auxiliary_loss_mlp": 0.01030788, "balance_loss_clip": 1.04897046, "balance_loss_mlp": 1.0229795, "epoch": 0.36625984488667107, "flos": 19755609540480.0, "grad_norm": 2.0480004377426564, "language_loss": 0.78561926, "learning_rate": 2.9267393368995103e-06, "loss": 0.80736136, "num_input_tokens_seen": 65370900, "step": 3046, "time_per_iteration": 2.6518871784210205 }, { "auxiliary_loss_clip": 0.01190881, "auxiliary_loss_mlp": 0.01028666, "balance_loss_clip": 1.05841458, "balance_loss_mlp": 1.02002907, "epoch": 0.3663800877773102, "flos": 17674262939520.0, "grad_norm": 2.614841880720753, "language_loss": 0.74498087, "learning_rate": 2.926048969151407e-06, "loss": 0.76717639, "num_input_tokens_seen": 65388185, "step": 3047, "time_per_iteration": 2.4901578426361084 }, { "auxiliary_loss_clip": 0.01130427, "auxiliary_loss_mlp": 0.01027308, "balance_loss_clip": 1.05459142, "balance_loss_mlp": 1.01883209, "epoch": 0.36650033066794924, "flos": 20303606407680.0, "grad_norm": 1.6388577348152669, "language_loss": 0.68485755, "learning_rate": 2.92535846092241e-06, "loss": 0.70643491, "num_input_tokens_seen": 65407200, "step": 3048, "time_per_iteration": 2.66616153717041 }, { "auxiliary_loss_clip": 0.01161495, "auxiliary_loss_mlp": 0.01030697, "balance_loss_clip": 1.05368531, "balance_loss_mlp": 1.02194738, "epoch": 0.36662057355858835, "flos": 24716237420160.0, "grad_norm": 1.5452805478719565, "language_loss": 0.82730567, "learning_rate": 2.9246678123172704e-06, "loss": 0.84922755, "num_input_tokens_seen": 65427290, "step": 3049, "time_per_iteration": 2.605611801147461 }, { "auxiliary_loss_clip": 0.01191277, "auxiliary_loss_mlp": 0.01035257, "balance_loss_clip": 1.0582962, "balance_loss_mlp": 1.02623308, "epoch": 0.36674081644922746, "flos": 12385267902720.0, "grad_norm": 2.8671481837670036, "language_loss": 0.74635351, "learning_rate": 2.9239770234407596e-06, "loss": 0.76861882, "num_input_tokens_seen": 65445595, "step": 3050, "time_per_iteration": 2.503836154937744 }, { "auxiliary_loss_clip": 0.01175483, "auxiliary_loss_mlp": 0.01027363, "balance_loss_clip": 1.05520046, "balance_loss_mlp": 1.01811838, "epoch": 0.3668610593398665, "flos": 21105922544640.0, "grad_norm": 1.555021859311062, "language_loss": 0.67829776, "learning_rate": 2.9232860943976686e-06, "loss": 0.7003262, "num_input_tokens_seen": 65466330, "step": 3051, "time_per_iteration": 2.5863325595855713 }, { "auxiliary_loss_clip": 0.01161029, "auxiliary_loss_mlp": 0.01031372, "balance_loss_clip": 1.05432987, "balance_loss_mlp": 1.02385259, "epoch": 0.3669813022305056, "flos": 26758082039040.0, "grad_norm": 1.6205452753751888, "language_loss": 0.84367633, "learning_rate": 2.9225950252928115e-06, "loss": 0.86560035, "num_input_tokens_seen": 65487180, "step": 3052, "time_per_iteration": 3.423762083053589 }, { "auxiliary_loss_clip": 0.01177108, "auxiliary_loss_mlp": 0.01028949, "balance_loss_clip": 1.05907035, "balance_loss_mlp": 1.01981807, "epoch": 0.36710154512114473, "flos": 19099521671040.0, "grad_norm": 2.806793971641526, "language_loss": 0.8180486, "learning_rate": 2.9219038162310217e-06, "loss": 0.84010917, "num_input_tokens_seen": 65505380, "step": 3053, "time_per_iteration": 2.524834632873535 }, { "auxiliary_loss_clip": 0.01103214, "auxiliary_loss_mlp": 0.00762307, "balance_loss_clip": 1.04688001, "balance_loss_mlp": 1.00020778, "epoch": 0.3672217880117838, "flos": 20812029465600.0, "grad_norm": 1.7769446040248729, "language_loss": 0.82772493, "learning_rate": 2.921212467317157e-06, "loss": 0.84638011, "num_input_tokens_seen": 65524825, "step": 3054, "time_per_iteration": 2.71394944190979 }, { "auxiliary_loss_clip": 0.01141752, "auxiliary_loss_mlp": 0.01038292, "balance_loss_clip": 1.0466361, "balance_loss_mlp": 1.02902913, "epoch": 0.3673420309024229, "flos": 13590394133760.0, "grad_norm": 1.7683938955733205, "language_loss": 0.79741436, "learning_rate": 2.920520978656093e-06, "loss": 0.81921482, "num_input_tokens_seen": 65541790, "step": 3055, "time_per_iteration": 2.5482518672943115 }, { "auxiliary_loss_clip": 0.01188595, "auxiliary_loss_mlp": 0.00762093, "balance_loss_clip": 1.05794322, "balance_loss_mlp": 1.00021434, "epoch": 0.367462273793062, "flos": 28986877969920.0, "grad_norm": 1.8278809278783392, "language_loss": 0.76687223, "learning_rate": 2.919829350352729e-06, "loss": 0.7863791, "num_input_tokens_seen": 65563395, "step": 3056, "time_per_iteration": 2.5884175300598145 }, { "auxiliary_loss_clip": 0.01082836, "auxiliary_loss_mlp": 0.0100887, "balance_loss_clip": 1.02306163, "balance_loss_mlp": 1.00726056, "epoch": 0.36758251668370107, "flos": 62643148346880.0, "grad_norm": 0.8191791098836778, "language_loss": 0.60094762, "learning_rate": 2.919137582511983e-06, "loss": 0.62186468, "num_input_tokens_seen": 65619835, "step": 3057, "time_per_iteration": 3.822255849838257 }, { "auxiliary_loss_clip": 0.01152975, "auxiliary_loss_mlp": 0.01032352, "balance_loss_clip": 1.05742431, "balance_loss_mlp": 1.02365518, "epoch": 0.3677027595743402, "flos": 12713886455040.0, "grad_norm": 1.9685655889550187, "language_loss": 0.64195728, "learning_rate": 2.918445675238797e-06, "loss": 0.66381061, "num_input_tokens_seen": 65636760, "step": 3058, "time_per_iteration": 3.3292181491851807 }, { "auxiliary_loss_clip": 0.01188056, "auxiliary_loss_mlp": 0.01030023, "balance_loss_clip": 1.0557456, "balance_loss_mlp": 1.02165508, "epoch": 0.36782300246497923, "flos": 25046579825280.0, "grad_norm": 1.9199615397620513, "language_loss": 0.69417918, "learning_rate": 2.917753628638132e-06, "loss": 0.71635997, "num_input_tokens_seen": 65657065, "step": 3059, "time_per_iteration": 2.565377712249756 }, { "auxiliary_loss_clip": 0.01162677, "auxiliary_loss_mlp": 0.01038534, "balance_loss_clip": 1.05530345, "balance_loss_mlp": 1.02993274, "epoch": 0.36794324535561834, "flos": 17419512706560.0, "grad_norm": 2.148561141928926, "language_loss": 0.70094121, "learning_rate": 2.9170614428149716e-06, "loss": 0.72295332, "num_input_tokens_seen": 65675400, "step": 3060, "time_per_iteration": 3.3701725006103516 }, { "auxiliary_loss_clip": 0.01143916, "auxiliary_loss_mlp": 0.01034783, "balance_loss_clip": 1.05254149, "balance_loss_mlp": 1.02533615, "epoch": 0.36806348824625745, "flos": 24089128848000.0, "grad_norm": 2.5323596018777814, "language_loss": 0.86748183, "learning_rate": 2.9163691178743195e-06, "loss": 0.88926882, "num_input_tokens_seen": 65694050, "step": 3061, "time_per_iteration": 2.6102726459503174 }, { "auxiliary_loss_clip": 0.01172401, "auxiliary_loss_mlp": 0.01029811, "balance_loss_clip": 1.05480886, "balance_loss_mlp": 1.02193129, "epoch": 0.3681837311368965, "flos": 20521871400960.0, "grad_norm": 2.2758273244429827, "language_loss": 0.77357745, "learning_rate": 2.9156766539212006e-06, "loss": 0.79559958, "num_input_tokens_seen": 65711695, "step": 3062, "time_per_iteration": 2.579263687133789 }, { "auxiliary_loss_clip": 0.01177576, "auxiliary_loss_mlp": 0.01036095, "balance_loss_clip": 1.05397654, "balance_loss_mlp": 1.0277319, "epoch": 0.3683039740275356, "flos": 21466644877440.0, "grad_norm": 2.075404749999038, "language_loss": 0.71951866, "learning_rate": 2.9149840510606614e-06, "loss": 0.74165535, "num_input_tokens_seen": 65730350, "step": 3063, "time_per_iteration": 2.5790624618530273 }, { "auxiliary_loss_clip": 0.01069476, "auxiliary_loss_mlp": 0.00752152, "balance_loss_clip": 1.02090478, "balance_loss_mlp": 1.00012386, "epoch": 0.36842421691817473, "flos": 70380999987840.0, "grad_norm": 1.0347194434592082, "language_loss": 0.64215702, "learning_rate": 2.914291309397769e-06, "loss": 0.66037327, "num_input_tokens_seen": 65787820, "step": 3064, "time_per_iteration": 3.246145248413086 }, { "auxiliary_loss_clip": 0.01106687, "auxiliary_loss_mlp": 0.01032249, "balance_loss_clip": 1.04713821, "balance_loss_mlp": 1.02320063, "epoch": 0.3685444598088138, "flos": 23331378510720.0, "grad_norm": 1.9023975164964235, "language_loss": 0.78268087, "learning_rate": 2.9135984290376117e-06, "loss": 0.80407023, "num_input_tokens_seen": 65806685, "step": 3065, "time_per_iteration": 2.6756486892700195 }, { "auxiliary_loss_clip": 0.01113001, "auxiliary_loss_mlp": 0.01037699, "balance_loss_clip": 1.04570687, "balance_loss_mlp": 1.02930677, "epoch": 0.3686647026994529, "flos": 23070271570560.0, "grad_norm": 2.011975139913776, "language_loss": 0.82539231, "learning_rate": 2.9129054100853e-06, "loss": 0.84689927, "num_input_tokens_seen": 65825525, "step": 3066, "time_per_iteration": 2.7358574867248535 }, { "auxiliary_loss_clip": 0.01159104, "auxiliary_loss_mlp": 0.01036792, "balance_loss_clip": 1.05203962, "balance_loss_mlp": 1.0278635, "epoch": 0.368784945590092, "flos": 25119909440640.0, "grad_norm": 1.6223965101953781, "language_loss": 0.75827396, "learning_rate": 2.912212252645963e-06, "loss": 0.78023291, "num_input_tokens_seen": 65848110, "step": 3067, "time_per_iteration": 2.6248300075531006 }, { "auxiliary_loss_clip": 0.01178366, "auxiliary_loss_mlp": 0.01034246, "balance_loss_clip": 1.05385065, "balance_loss_mlp": 1.02538848, "epoch": 0.36890518848073106, "flos": 18442284566400.0, "grad_norm": 1.9277229167586838, "language_loss": 0.76767355, "learning_rate": 2.9115189568247523e-06, "loss": 0.78979963, "num_input_tokens_seen": 65865670, "step": 3068, "time_per_iteration": 2.585052490234375 }, { "auxiliary_loss_clip": 0.0112304, "auxiliary_loss_mlp": 0.01034124, "balance_loss_clip": 1.05406666, "balance_loss_mlp": 1.02585936, "epoch": 0.36902543137137017, "flos": 16362446336640.0, "grad_norm": 2.961541685481398, "language_loss": 0.9197461, "learning_rate": 2.910825522726841e-06, "loss": 0.94131768, "num_input_tokens_seen": 65883195, "step": 3069, "time_per_iteration": 2.6364240646362305 }, { "auxiliary_loss_clip": 0.01124674, "auxiliary_loss_mlp": 0.01035394, "balance_loss_clip": 1.0478214, "balance_loss_mlp": 1.02716875, "epoch": 0.3691456742620093, "flos": 12275596702080.0, "grad_norm": 1.8801250294984104, "language_loss": 0.77230316, "learning_rate": 2.9101319504574215e-06, "loss": 0.79390383, "num_input_tokens_seen": 65899635, "step": 3070, "time_per_iteration": 2.650235891342163 }, { "auxiliary_loss_clip": 0.01163101, "auxiliary_loss_mlp": 0.01023635, "balance_loss_clip": 1.05046046, "balance_loss_mlp": 1.01534951, "epoch": 0.36926591715264834, "flos": 17786412178560.0, "grad_norm": 2.0968095878394926, "language_loss": 0.76134789, "learning_rate": 2.909438240121709e-06, "loss": 0.78321517, "num_input_tokens_seen": 65919910, "step": 3071, "time_per_iteration": 2.5852949619293213 }, { "auxiliary_loss_clip": 0.01153484, "auxiliary_loss_mlp": 0.01035242, "balance_loss_clip": 1.05448127, "balance_loss_mlp": 1.02719545, "epoch": 0.36938616004328745, "flos": 28948309741440.0, "grad_norm": 1.624665255715946, "language_loss": 0.7051416, "learning_rate": 2.908744391824939e-06, "loss": 0.72702885, "num_input_tokens_seen": 65940930, "step": 3072, "time_per_iteration": 2.6355745792388916 }, { "auxiliary_loss_clip": 0.01117484, "auxiliary_loss_mlp": 0.01026109, "balance_loss_clip": 1.04616261, "balance_loss_mlp": 1.01765084, "epoch": 0.36950640293392656, "flos": 29205394358400.0, "grad_norm": 1.6971752209306938, "language_loss": 0.7891084, "learning_rate": 2.908050405672367e-06, "loss": 0.81054437, "num_input_tokens_seen": 65960475, "step": 3073, "time_per_iteration": 2.740341901779175 }, { "auxiliary_loss_clip": 0.0116394, "auxiliary_loss_mlp": 0.01027415, "balance_loss_clip": 1.05004096, "balance_loss_mlp": 1.01885569, "epoch": 0.3696266458245656, "flos": 24827776128000.0, "grad_norm": 1.702741025165438, "language_loss": 0.79109931, "learning_rate": 2.9073562817692703e-06, "loss": 0.81301296, "num_input_tokens_seen": 65979160, "step": 3074, "time_per_iteration": 2.620823860168457 }, { "auxiliary_loss_clip": 0.01045622, "auxiliary_loss_mlp": 0.01001565, "balance_loss_clip": 1.02275658, "balance_loss_mlp": 0.99986076, "epoch": 0.3697468887152047, "flos": 59887257264000.0, "grad_norm": 0.7213313569317554, "language_loss": 0.56544518, "learning_rate": 2.9066620202209468e-06, "loss": 0.585917, "num_input_tokens_seen": 66041650, "step": 3075, "time_per_iteration": 3.2141683101654053 }, { "auxiliary_loss_clip": 0.01136179, "auxiliary_loss_mlp": 0.01026171, "balance_loss_clip": 1.05037737, "balance_loss_mlp": 1.01789784, "epoch": 0.3698671316058438, "flos": 26137581569280.0, "grad_norm": 1.8204161791523308, "language_loss": 0.77551782, "learning_rate": 2.905967621132716e-06, "loss": 0.79714131, "num_input_tokens_seen": 66059260, "step": 3076, "time_per_iteration": 2.659736394882202 }, { "auxiliary_loss_clip": 0.01158659, "auxiliary_loss_mlp": 0.01030346, "balance_loss_clip": 1.05056643, "balance_loss_mlp": 1.0214529, "epoch": 0.3699873744964829, "flos": 24607464059520.0, "grad_norm": 2.0423881445281955, "language_loss": 0.75529003, "learning_rate": 2.9052730846099172e-06, "loss": 0.77718008, "num_input_tokens_seen": 66080605, "step": 3077, "time_per_iteration": 2.614818572998047 }, { "auxiliary_loss_clip": 0.01059164, "auxiliary_loss_mlp": 0.01002919, "balance_loss_clip": 1.01953566, "balance_loss_mlp": 1.00122607, "epoch": 0.370107617387122, "flos": 64885340050560.0, "grad_norm": 0.8579539014062775, "language_loss": 0.6100443, "learning_rate": 2.9045784107579123e-06, "loss": 0.63066512, "num_input_tokens_seen": 66140710, "step": 3078, "time_per_iteration": 3.914754867553711 }, { "auxiliary_loss_clip": 0.01188831, "auxiliary_loss_mlp": 0.01027849, "balance_loss_clip": 1.05717099, "balance_loss_mlp": 1.02026725, "epoch": 0.37022786027776106, "flos": 15961683317760.0, "grad_norm": 1.8514519508278144, "language_loss": 0.66742313, "learning_rate": 2.9038835996820807e-06, "loss": 0.68958992, "num_input_tokens_seen": 66158320, "step": 3079, "time_per_iteration": 2.482332944869995 }, { "auxiliary_loss_clip": 0.01147922, "auxiliary_loss_mlp": 0.01040174, "balance_loss_clip": 1.0488236, "balance_loss_mlp": 1.03209162, "epoch": 0.37034810316840017, "flos": 18546927863040.0, "grad_norm": 1.6906608437094306, "language_loss": 0.79733455, "learning_rate": 2.903188651487826e-06, "loss": 0.81921548, "num_input_tokens_seen": 66176875, "step": 3080, "time_per_iteration": 2.6286144256591797 }, { "auxiliary_loss_clip": 0.01177393, "auxiliary_loss_mlp": 0.01035292, "balance_loss_clip": 1.05609608, "balance_loss_mlp": 1.0269115, "epoch": 0.3704683460590393, "flos": 17821927751040.0, "grad_norm": 2.026080146369784, "language_loss": 0.86605823, "learning_rate": 2.902493566280571e-06, "loss": 0.88818514, "num_input_tokens_seen": 66194980, "step": 3081, "time_per_iteration": 2.4914939403533936 }, { "auxiliary_loss_clip": 0.0115648, "auxiliary_loss_mlp": 0.01032753, "balance_loss_clip": 1.05311823, "balance_loss_mlp": 1.02411056, "epoch": 0.37058858894967833, "flos": 14134081368960.0, "grad_norm": 1.785586424285132, "language_loss": 0.80917335, "learning_rate": 2.9017983441657595e-06, "loss": 0.83106565, "num_input_tokens_seen": 66212310, "step": 3082, "time_per_iteration": 2.5235087871551514 }, { "auxiliary_loss_clip": 0.01131251, "auxiliary_loss_mlp": 0.01029464, "balance_loss_clip": 1.04671597, "balance_loss_mlp": 1.02097678, "epoch": 0.37070883184031744, "flos": 13954492344960.0, "grad_norm": 2.6531039938454777, "language_loss": 0.7503525, "learning_rate": 2.9011029852488564e-06, "loss": 0.77195966, "num_input_tokens_seen": 66229545, "step": 3083, "time_per_iteration": 3.3755571842193604 }, { "auxiliary_loss_clip": 0.01080276, "auxiliary_loss_mlp": 0.0100149, "balance_loss_clip": 1.02083635, "balance_loss_mlp": 0.99997008, "epoch": 0.37082907473095655, "flos": 52315419306240.0, "grad_norm": 0.9835191681181462, "language_loss": 0.62461263, "learning_rate": 2.9004074896353465e-06, "loss": 0.64543027, "num_input_tokens_seen": 66283545, "step": 3084, "time_per_iteration": 3.6774215698242188 }, { "auxiliary_loss_clip": 0.01186565, "auxiliary_loss_mlp": 0.01032013, "balance_loss_clip": 1.05795574, "balance_loss_mlp": 1.02381778, "epoch": 0.3709493176215956, "flos": 15998096730240.0, "grad_norm": 1.6649196559817716, "language_loss": 0.81604838, "learning_rate": 2.8997118574307362e-06, "loss": 0.83823419, "num_input_tokens_seen": 66300500, "step": 3085, "time_per_iteration": 3.300213098526001 }, { "auxiliary_loss_clip": 0.01150652, "auxiliary_loss_mlp": 0.01031741, "balance_loss_clip": 1.05212224, "balance_loss_mlp": 1.02330148, "epoch": 0.3710695605122347, "flos": 20959837931520.0, "grad_norm": 1.8370776807017108, "language_loss": 0.74297613, "learning_rate": 2.899016088740553e-06, "loss": 0.76480007, "num_input_tokens_seen": 66318610, "step": 3086, "time_per_iteration": 2.5594472885131836 }, { "auxiliary_loss_clip": 0.0113054, "auxiliary_loss_mlp": 0.01035854, "balance_loss_clip": 1.05115592, "balance_loss_mlp": 1.02755141, "epoch": 0.37118980340287383, "flos": 14355578586240.0, "grad_norm": 2.01558522223361, "language_loss": 0.79210949, "learning_rate": 2.898320183670344e-06, "loss": 0.81377345, "num_input_tokens_seen": 66336025, "step": 3087, "time_per_iteration": 2.606536626815796 }, { "auxiliary_loss_clip": 0.01130537, "auxiliary_loss_mlp": 0.01037087, "balance_loss_clip": 1.05345142, "balance_loss_mlp": 1.02880192, "epoch": 0.3713100462935129, "flos": 25885381201920.0, "grad_norm": 1.6283201077915095, "language_loss": 0.89131236, "learning_rate": 2.8976241423256767e-06, "loss": 0.9129886, "num_input_tokens_seen": 66356120, "step": 3088, "time_per_iteration": 2.6480982303619385 }, { "auxiliary_loss_clip": 0.01152222, "auxiliary_loss_mlp": 0.0103117, "balance_loss_clip": 1.05087292, "balance_loss_mlp": 1.02371931, "epoch": 0.371430289184152, "flos": 30518934814080.0, "grad_norm": 3.5707084902204342, "language_loss": 0.67942512, "learning_rate": 2.896927964812142e-06, "loss": 0.70125902, "num_input_tokens_seen": 66376685, "step": 3089, "time_per_iteration": 2.6328155994415283 }, { "auxiliary_loss_clip": 0.0115504, "auxiliary_loss_mlp": 0.01030157, "balance_loss_clip": 1.05339503, "balance_loss_mlp": 1.02196133, "epoch": 0.37155053207479105, "flos": 15742233175680.0, "grad_norm": 2.301872782142269, "language_loss": 0.74872816, "learning_rate": 2.8962316512353465e-06, "loss": 0.77058011, "num_input_tokens_seen": 66394230, "step": 3090, "time_per_iteration": 2.5298497676849365 }, { "auxiliary_loss_clip": 0.01110719, "auxiliary_loss_mlp": 0.01030922, "balance_loss_clip": 1.04499722, "balance_loss_mlp": 1.02294111, "epoch": 0.37167077496543016, "flos": 23404061681280.0, "grad_norm": 1.492641239836418, "language_loss": 0.74721301, "learning_rate": 2.8955352017009233e-06, "loss": 0.76862949, "num_input_tokens_seen": 66413475, "step": 3091, "time_per_iteration": 2.6866841316223145 }, { "auxiliary_loss_clip": 0.01157479, "auxiliary_loss_mlp": 0.01036611, "balance_loss_clip": 1.05369103, "balance_loss_mlp": 1.02840972, "epoch": 0.3717910178560693, "flos": 22088653718400.0, "grad_norm": 1.7596695269494207, "language_loss": 0.77213919, "learning_rate": 2.8948386163145212e-06, "loss": 0.79408002, "num_input_tokens_seen": 66432685, "step": 3092, "time_per_iteration": 2.5968210697174072 }, { "auxiliary_loss_clip": 0.01175526, "auxiliary_loss_mlp": 0.01026011, "balance_loss_clip": 1.0554738, "balance_loss_mlp": 1.01829767, "epoch": 0.3719112607467083, "flos": 26939969533440.0, "grad_norm": 1.7400800800950047, "language_loss": 0.79439336, "learning_rate": 2.8941418951818135e-06, "loss": 0.81640869, "num_input_tokens_seen": 66452245, "step": 3093, "time_per_iteration": 2.5818326473236084 }, { "auxiliary_loss_clip": 0.0114578, "auxiliary_loss_mlp": 0.01026169, "balance_loss_clip": 1.04969275, "balance_loss_mlp": 1.01872683, "epoch": 0.37203150363734744, "flos": 12166500119040.0, "grad_norm": 2.0993453355913756, "language_loss": 0.71011704, "learning_rate": 2.8934450384084903e-06, "loss": 0.7318365, "num_input_tokens_seen": 66469760, "step": 3094, "time_per_iteration": 2.583446741104126 }, { "auxiliary_loss_clip": 0.01149581, "auxiliary_loss_mlp": 0.01030282, "balance_loss_clip": 1.05014801, "balance_loss_mlp": 1.02197325, "epoch": 0.37215174652798655, "flos": 23697595624320.0, "grad_norm": 2.0630284855135597, "language_loss": 0.69538569, "learning_rate": 2.8927480461002653e-06, "loss": 0.71718436, "num_input_tokens_seen": 66489730, "step": 3095, "time_per_iteration": 2.5924856662750244 }, { "auxiliary_loss_clip": 0.01154355, "auxiliary_loss_mlp": 0.01033382, "balance_loss_clip": 1.04799032, "balance_loss_mlp": 1.02464437, "epoch": 0.3722719894186256, "flos": 17887751424000.0, "grad_norm": 3.6124445772948035, "language_loss": 0.85957086, "learning_rate": 2.892050918362872e-06, "loss": 0.88144827, "num_input_tokens_seen": 66504785, "step": 3096, "time_per_iteration": 2.5162241458892822 }, { "auxiliary_loss_clip": 0.01009706, "auxiliary_loss_mlp": 0.01004555, "balance_loss_clip": 1.01421881, "balance_loss_mlp": 1.00317764, "epoch": 0.3723922323092647, "flos": 62419891363200.0, "grad_norm": 0.8444655002282905, "language_loss": 0.5600332, "learning_rate": 2.8913536553020626e-06, "loss": 0.58017582, "num_input_tokens_seen": 66558840, "step": 3097, "time_per_iteration": 3.3245577812194824 }, { "auxiliary_loss_clip": 0.01119395, "auxiliary_loss_mlp": 0.01034725, "balance_loss_clip": 1.04538298, "balance_loss_mlp": 1.0263443, "epoch": 0.3725124751999038, "flos": 23039747988480.0, "grad_norm": 1.7917782439039736, "language_loss": 0.85026723, "learning_rate": 2.8906562570236137e-06, "loss": 0.87180841, "num_input_tokens_seen": 66576750, "step": 3098, "time_per_iteration": 2.7825443744659424 }, { "auxiliary_loss_clip": 0.01112119, "auxiliary_loss_mlp": 0.01026784, "balance_loss_clip": 1.04340136, "balance_loss_mlp": 1.01937521, "epoch": 0.3726327180905429, "flos": 20920551431040.0, "grad_norm": 1.5166029996182928, "language_loss": 0.76629043, "learning_rate": 2.889958723633318e-06, "loss": 0.78767943, "num_input_tokens_seen": 66595690, "step": 3099, "time_per_iteration": 2.6565101146698 }, { "auxiliary_loss_clip": 0.01143699, "auxiliary_loss_mlp": 0.01032107, "balance_loss_clip": 1.0531491, "balance_loss_mlp": 1.0239526, "epoch": 0.372752960981182, "flos": 30592156688640.0, "grad_norm": 1.6180833413597637, "language_loss": 0.73731536, "learning_rate": 2.889261055236992e-06, "loss": 0.75907344, "num_input_tokens_seen": 66617905, "step": 3100, "time_per_iteration": 3.058896064758301 }, { "auxiliary_loss_clip": 0.01157165, "auxiliary_loss_mlp": 0.01033167, "balance_loss_clip": 1.05396652, "balance_loss_mlp": 1.02491772, "epoch": 0.3728732038718211, "flos": 25116749043840.0, "grad_norm": 1.849492178690653, "language_loss": 0.82501423, "learning_rate": 2.8885632519404704e-06, "loss": 0.84691757, "num_input_tokens_seen": 66638175, "step": 3101, "time_per_iteration": 2.6088409423828125 }, { "auxiliary_loss_clip": 0.01156361, "auxiliary_loss_mlp": 0.01030068, "balance_loss_clip": 1.05325365, "balance_loss_mlp": 1.02203298, "epoch": 0.37299344676246016, "flos": 25302048330240.0, "grad_norm": 2.0051315284697098, "language_loss": 0.75411737, "learning_rate": 2.8878653138496107e-06, "loss": 0.77598155, "num_input_tokens_seen": 66658670, "step": 3102, "time_per_iteration": 2.62419056892395 }, { "auxiliary_loss_clip": 0.01111724, "auxiliary_loss_mlp": 0.01029633, "balance_loss_clip": 1.04370189, "balance_loss_mlp": 1.02160072, "epoch": 0.37311368965309927, "flos": 23842531002240.0, "grad_norm": 2.212292725787708, "language_loss": 0.76913017, "learning_rate": 2.8871672410702878e-06, "loss": 0.79054368, "num_input_tokens_seen": 66676030, "step": 3103, "time_per_iteration": 2.671839714050293 }, { "auxiliary_loss_clip": 0.01148695, "auxiliary_loss_mlp": 0.01028559, "balance_loss_clip": 1.04836142, "balance_loss_mlp": 1.01965976, "epoch": 0.3732339325437384, "flos": 25811943845760.0, "grad_norm": 1.6788766857892055, "language_loss": 0.81964219, "learning_rate": 2.8864690337084008e-06, "loss": 0.84141469, "num_input_tokens_seen": 66695305, "step": 3104, "time_per_iteration": 3.5025062561035156 }, { "auxiliary_loss_clip": 0.01165877, "auxiliary_loss_mlp": 0.01035587, "balance_loss_clip": 1.05122685, "balance_loss_mlp": 1.02755868, "epoch": 0.37335417543437743, "flos": 26208433146240.0, "grad_norm": 1.9100694158133065, "language_loss": 0.77847707, "learning_rate": 2.885770691869866e-06, "loss": 0.80049175, "num_input_tokens_seen": 66716185, "step": 3105, "time_per_iteration": 2.567470073699951 }, { "auxiliary_loss_clip": 0.01167675, "auxiliary_loss_mlp": 0.01033682, "balance_loss_clip": 1.05389857, "balance_loss_mlp": 1.02549219, "epoch": 0.37347441832501654, "flos": 24023879792640.0, "grad_norm": 2.224968857459263, "language_loss": 0.74490809, "learning_rate": 2.8850722156606207e-06, "loss": 0.76692164, "num_input_tokens_seen": 66734575, "step": 3106, "time_per_iteration": 2.5556812286376953 }, { "auxiliary_loss_clip": 0.01162036, "auxiliary_loss_mlp": 0.01035635, "balance_loss_clip": 1.04940033, "balance_loss_mlp": 1.02794611, "epoch": 0.3735946612156556, "flos": 19714922409600.0, "grad_norm": 1.6230341502152954, "language_loss": 0.66967607, "learning_rate": 2.8843736051866252e-06, "loss": 0.69165277, "num_input_tokens_seen": 66753500, "step": 3107, "time_per_iteration": 2.5148329734802246 }, { "auxiliary_loss_clip": 0.01126453, "auxiliary_loss_mlp": 0.00761312, "balance_loss_clip": 1.04712462, "balance_loss_mlp": 1.00029409, "epoch": 0.3737149041062947, "flos": 23039604334080.0, "grad_norm": 1.519579327745257, "language_loss": 0.69546807, "learning_rate": 2.8836748605538557e-06, "loss": 0.71434569, "num_input_tokens_seen": 66775140, "step": 3108, "time_per_iteration": 2.670302629470825 }, { "auxiliary_loss_clip": 0.01161312, "auxiliary_loss_mlp": 0.01028925, "balance_loss_clip": 1.05257463, "balance_loss_mlp": 1.02043176, "epoch": 0.3738351469969338, "flos": 34678108483200.0, "grad_norm": 1.961512812219949, "language_loss": 0.63319892, "learning_rate": 2.882975981868313e-06, "loss": 0.6551013, "num_input_tokens_seen": 66795525, "step": 3109, "time_per_iteration": 4.175340414047241 }, { "auxiliary_loss_clip": 0.01168867, "auxiliary_loss_mlp": 0.01033556, "balance_loss_clip": 1.05434895, "balance_loss_mlp": 1.02533686, "epoch": 0.3739553898875729, "flos": 43507967448960.0, "grad_norm": 2.7617806843406574, "language_loss": 0.68699944, "learning_rate": 2.882276969236016e-06, "loss": 0.70902371, "num_input_tokens_seen": 66816885, "step": 3110, "time_per_iteration": 3.542484760284424 }, { "auxiliary_loss_clip": 0.01152661, "auxiliary_loss_mlp": 0.01027832, "balance_loss_clip": 1.05029416, "balance_loss_mlp": 1.01970768, "epoch": 0.374075632778212, "flos": 12856487448960.0, "grad_norm": 2.0019563037806583, "language_loss": 0.76665866, "learning_rate": 2.881577822763005e-06, "loss": 0.78846359, "num_input_tokens_seen": 66834835, "step": 3111, "time_per_iteration": 2.5261940956115723 }, { "auxiliary_loss_clip": 0.01168773, "auxiliary_loss_mlp": 0.01029592, "balance_loss_clip": 1.05223024, "balance_loss_mlp": 1.02170062, "epoch": 0.3741958756688511, "flos": 26024031699840.0, "grad_norm": 1.7818054056493005, "language_loss": 0.87562221, "learning_rate": 2.880878542555338e-06, "loss": 0.89760584, "num_input_tokens_seen": 66852600, "step": 3112, "time_per_iteration": 2.550283670425415 }, { "auxiliary_loss_clip": 0.011835, "auxiliary_loss_mlp": 0.01039905, "balance_loss_clip": 1.05292261, "balance_loss_mlp": 1.03148293, "epoch": 0.37431611855949015, "flos": 21433894652160.0, "grad_norm": 1.9655739173930757, "language_loss": 0.80952716, "learning_rate": 2.8801791287190976e-06, "loss": 0.83176112, "num_input_tokens_seen": 66870595, "step": 3113, "time_per_iteration": 2.4860949516296387 }, { "auxiliary_loss_clip": 0.01171419, "auxiliary_loss_mlp": 0.01029833, "balance_loss_clip": 1.05272532, "balance_loss_mlp": 1.02154779, "epoch": 0.37443636145012926, "flos": 24207096090240.0, "grad_norm": 2.5747255725972837, "language_loss": 0.86600751, "learning_rate": 2.8794795813603817e-06, "loss": 0.88802004, "num_input_tokens_seen": 66886060, "step": 3114, "time_per_iteration": 2.529881000518799 }, { "auxiliary_loss_clip": 0.01173194, "auxiliary_loss_mlp": 0.01035831, "balance_loss_clip": 1.05213451, "balance_loss_mlp": 1.02775419, "epoch": 0.3745566043407684, "flos": 15378601841280.0, "grad_norm": 1.6795689400736182, "language_loss": 0.81589979, "learning_rate": 2.878779900585314e-06, "loss": 0.83799005, "num_input_tokens_seen": 66903900, "step": 3115, "time_per_iteration": 2.4610259532928467 }, { "auxiliary_loss_clip": 0.01162878, "auxiliary_loss_mlp": 0.01030351, "balance_loss_clip": 1.05343223, "balance_loss_mlp": 1.02321661, "epoch": 0.37467684723140743, "flos": 24608218245120.0, "grad_norm": 1.6029659177132476, "language_loss": 0.754053, "learning_rate": 2.8780800865000336e-06, "loss": 0.7759853, "num_input_tokens_seen": 66925210, "step": 3116, "time_per_iteration": 2.5944268703460693 }, { "auxiliary_loss_clip": 0.010672, "auxiliary_loss_mlp": 0.01003033, "balance_loss_clip": 1.0180819, "balance_loss_mlp": 1.00154924, "epoch": 0.37479709012204654, "flos": 64377491610240.0, "grad_norm": 0.9762708163580068, "language_loss": 0.59219861, "learning_rate": 2.877380139210702e-06, "loss": 0.61290085, "num_input_tokens_seen": 66983880, "step": 3117, "time_per_iteration": 3.058168411254883 }, { "auxiliary_loss_clip": 0.01143135, "auxiliary_loss_mlp": 0.01029084, "balance_loss_clip": 1.05072784, "balance_loss_mlp": 1.02097225, "epoch": 0.37491733301268565, "flos": 23803962773760.0, "grad_norm": 1.6924516172608204, "language_loss": 0.7626003, "learning_rate": 2.876680058823501e-06, "loss": 0.7843225, "num_input_tokens_seen": 67004280, "step": 3118, "time_per_iteration": 2.5937039852142334 }, { "auxiliary_loss_clip": 0.01146646, "auxiliary_loss_mlp": 0.01026606, "balance_loss_clip": 1.04993105, "balance_loss_mlp": 1.01737356, "epoch": 0.3750375759033247, "flos": 32160950167680.0, "grad_norm": 1.8492100168528574, "language_loss": 0.66320944, "learning_rate": 2.8759798454446314e-06, "loss": 0.68494201, "num_input_tokens_seen": 67027445, "step": 3119, "time_per_iteration": 2.627420425415039 }, { "auxiliary_loss_clip": 0.01173238, "auxiliary_loss_mlp": 0.01030997, "balance_loss_clip": 1.05419493, "balance_loss_mlp": 1.02276576, "epoch": 0.3751578187939638, "flos": 23367791923200.0, "grad_norm": 1.7829956234710356, "language_loss": 0.81399703, "learning_rate": 2.8752794991803173e-06, "loss": 0.83603942, "num_input_tokens_seen": 67045130, "step": 3120, "time_per_iteration": 2.532078504562378 }, { "auxiliary_loss_clip": 0.01156024, "auxiliary_loss_mlp": 0.01031941, "balance_loss_clip": 1.05477691, "balance_loss_mlp": 1.02467489, "epoch": 0.37527806168460287, "flos": 14605731878400.0, "grad_norm": 1.8337930466584658, "language_loss": 0.75310308, "learning_rate": 2.8745790201367976e-06, "loss": 0.77498275, "num_input_tokens_seen": 67060885, "step": 3121, "time_per_iteration": 2.5081212520599365 }, { "auxiliary_loss_clip": 0.01186766, "auxiliary_loss_mlp": 0.01031347, "balance_loss_clip": 1.05467927, "balance_loss_mlp": 1.02240062, "epoch": 0.375398304575242, "flos": 26390823431040.0, "grad_norm": 1.9795001321221324, "language_loss": 0.84083986, "learning_rate": 2.8738784084203373e-06, "loss": 0.86302096, "num_input_tokens_seen": 67080960, "step": 3122, "time_per_iteration": 2.533269166946411 }, { "auxiliary_loss_clip": 0.01149735, "auxiliary_loss_mlp": 0.01029931, "balance_loss_clip": 1.04930878, "balance_loss_mlp": 1.02229559, "epoch": 0.3755185474658811, "flos": 22236605838720.0, "grad_norm": 1.714503921975435, "language_loss": 0.78795302, "learning_rate": 2.873177664137216e-06, "loss": 0.80974972, "num_input_tokens_seen": 67101890, "step": 3123, "time_per_iteration": 2.57914400100708 }, { "auxiliary_loss_clip": 0.01138866, "auxiliary_loss_mlp": 0.01025507, "balance_loss_clip": 1.05187714, "balance_loss_mlp": 1.01778793, "epoch": 0.37563879035652015, "flos": 30812935633920.0, "grad_norm": 2.233959996422891, "language_loss": 0.69058216, "learning_rate": 2.8724767873937384e-06, "loss": 0.71222591, "num_input_tokens_seen": 67126010, "step": 3124, "time_per_iteration": 2.6755900382995605 }, { "auxiliary_loss_clip": 0.01155075, "auxiliary_loss_mlp": 0.01024807, "balance_loss_clip": 1.05157638, "balance_loss_mlp": 1.0170821, "epoch": 0.37575903324715926, "flos": 20773533064320.0, "grad_norm": 1.9348775240610623, "language_loss": 0.87052363, "learning_rate": 2.871775778296225e-06, "loss": 0.89232242, "num_input_tokens_seen": 67143100, "step": 3125, "time_per_iteration": 2.540315628051758 }, { "auxiliary_loss_clip": 0.01171018, "auxiliary_loss_mlp": 0.01028914, "balance_loss_clip": 1.05548894, "balance_loss_mlp": 1.02074194, "epoch": 0.37587927613779837, "flos": 18697681244160.0, "grad_norm": 2.045301713367826, "language_loss": 0.78424966, "learning_rate": 2.8710746369510196e-06, "loss": 0.80624896, "num_input_tokens_seen": 67161085, "step": 3126, "time_per_iteration": 2.526965856552124 }, { "auxiliary_loss_clip": 0.01151032, "auxiliary_loss_mlp": 0.01033561, "balance_loss_clip": 1.0509851, "balance_loss_mlp": 1.0252465, "epoch": 0.3759995190284374, "flos": 13624796384640.0, "grad_norm": 2.3746909476439964, "language_loss": 0.83908075, "learning_rate": 2.8703733634644846e-06, "loss": 0.86092669, "num_input_tokens_seen": 67175840, "step": 3127, "time_per_iteration": 2.510077953338623 }, { "auxiliary_loss_clip": 0.01184159, "auxiliary_loss_mlp": 0.01024949, "balance_loss_clip": 1.05629849, "balance_loss_mlp": 1.01733708, "epoch": 0.37611976191907653, "flos": 20484847457280.0, "grad_norm": 1.6197542842097143, "language_loss": 0.78890401, "learning_rate": 2.869671957943002e-06, "loss": 0.81099504, "num_input_tokens_seen": 67194995, "step": 3128, "time_per_iteration": 2.5099806785583496 }, { "auxiliary_loss_clip": 0.01154343, "auxiliary_loss_mlp": 0.01036077, "balance_loss_clip": 1.05813646, "balance_loss_mlp": 1.0283339, "epoch": 0.37624000480971564, "flos": 21141797253120.0, "grad_norm": 1.7836061818240865, "language_loss": 0.74326438, "learning_rate": 2.8689704204929747e-06, "loss": 0.76516855, "num_input_tokens_seen": 67214175, "step": 3129, "time_per_iteration": 2.5851807594299316 }, { "auxiliary_loss_clip": 0.01186337, "auxiliary_loss_mlp": 0.01034023, "balance_loss_clip": 1.05537009, "balance_loss_mlp": 1.02568424, "epoch": 0.3763602477003547, "flos": 22564470205440.0, "grad_norm": 11.064369512615885, "language_loss": 0.81097388, "learning_rate": 2.8682687512208253e-06, "loss": 0.83317745, "num_input_tokens_seen": 67233185, "step": 3130, "time_per_iteration": 3.2288777828216553 }, { "auxiliary_loss_clip": 0.01177207, "auxiliary_loss_mlp": 0.01031081, "balance_loss_clip": 1.05496669, "balance_loss_mlp": 1.0233686, "epoch": 0.3764804905909938, "flos": 27526857851520.0, "grad_norm": 2.0343474064386537, "language_loss": 0.80458796, "learning_rate": 2.8675669502329972e-06, "loss": 0.82667077, "num_input_tokens_seen": 67254715, "step": 3131, "time_per_iteration": 2.5694735050201416 }, { "auxiliary_loss_clip": 0.01166704, "auxiliary_loss_mlp": 0.00761321, "balance_loss_clip": 1.05205703, "balance_loss_mlp": 1.00027251, "epoch": 0.3766007334816329, "flos": 22528092706560.0, "grad_norm": 2.343843029903865, "language_loss": 0.86289328, "learning_rate": 2.866865017635952e-06, "loss": 0.88217354, "num_input_tokens_seen": 67272535, "step": 3132, "time_per_iteration": 2.5360028743743896 }, { "auxiliary_loss_clip": 0.01139843, "auxiliary_loss_mlp": 0.01032512, "balance_loss_clip": 1.05374122, "balance_loss_mlp": 1.02442348, "epoch": 0.376720976372272, "flos": 25957166532480.0, "grad_norm": 1.554909453421561, "language_loss": 0.7903167, "learning_rate": 2.866162953536174e-06, "loss": 0.81204021, "num_input_tokens_seen": 67293505, "step": 3133, "time_per_iteration": 2.617272138595581 }, { "auxiliary_loss_clip": 0.01154852, "auxiliary_loss_mlp": 0.00761359, "balance_loss_clip": 1.05102825, "balance_loss_mlp": 1.00031066, "epoch": 0.3768412192629111, "flos": 18041162411520.0, "grad_norm": 1.6378832987875838, "language_loss": 0.75217307, "learning_rate": 2.8654607580401634e-06, "loss": 0.77133518, "num_input_tokens_seen": 67313240, "step": 3134, "time_per_iteration": 2.568263292312622 }, { "auxiliary_loss_clip": 0.0106874, "auxiliary_loss_mlp": 0.01011893, "balance_loss_clip": 1.0207119, "balance_loss_mlp": 1.01042068, "epoch": 0.3769614621535502, "flos": 62989472304000.0, "grad_norm": 0.8797373320484281, "language_loss": 0.65255582, "learning_rate": 2.8647584312544446e-06, "loss": 0.67336214, "num_input_tokens_seen": 67378445, "step": 3135, "time_per_iteration": 3.9317147731781006 }, { "auxiliary_loss_clip": 0.01135453, "auxiliary_loss_mlp": 0.00760963, "balance_loss_clip": 1.04708123, "balance_loss_mlp": 1.00028825, "epoch": 0.37708170504418925, "flos": 23661685002240.0, "grad_norm": 1.3767434421482694, "language_loss": 0.85220915, "learning_rate": 2.864055973285559e-06, "loss": 0.87117332, "num_input_tokens_seen": 67400445, "step": 3136, "time_per_iteration": 3.4891505241394043 }, { "auxiliary_loss_clip": 0.01144211, "auxiliary_loss_mlp": 0.01032361, "balance_loss_clip": 1.04866123, "balance_loss_mlp": 1.02457094, "epoch": 0.37720194793482836, "flos": 24423170353920.0, "grad_norm": 1.8008835842251847, "language_loss": 0.86144984, "learning_rate": 2.8633533842400698e-06, "loss": 0.88321561, "num_input_tokens_seen": 67420645, "step": 3137, "time_per_iteration": 2.5921103954315186 }, { "auxiliary_loss_clip": 0.01174223, "auxiliary_loss_mlp": 0.00762225, "balance_loss_clip": 1.05582714, "balance_loss_mlp": 1.00029254, "epoch": 0.3773221908254674, "flos": 20996502739200.0, "grad_norm": 1.7841484916481138, "language_loss": 0.77574736, "learning_rate": 2.862650664224558e-06, "loss": 0.79511189, "num_input_tokens_seen": 67439495, "step": 3138, "time_per_iteration": 2.5838611125946045 }, { "auxiliary_loss_clip": 0.01171531, "auxiliary_loss_mlp": 0.01031878, "balance_loss_clip": 1.05891132, "balance_loss_mlp": 1.02420425, "epoch": 0.37744243371610653, "flos": 37631724958080.0, "grad_norm": 1.4905736616524992, "language_loss": 0.69560575, "learning_rate": 2.861947813345627e-06, "loss": 0.7176398, "num_input_tokens_seen": 67462195, "step": 3139, "time_per_iteration": 2.7531089782714844 }, { "auxiliary_loss_clip": 0.01186575, "auxiliary_loss_mlp": 0.00761804, "balance_loss_clip": 1.05543816, "balance_loss_mlp": 1.00029206, "epoch": 0.37756267660674564, "flos": 26140526484480.0, "grad_norm": 2.4100280901583724, "language_loss": 0.72835028, "learning_rate": 2.8612448317098974e-06, "loss": 0.74783409, "num_input_tokens_seen": 67482530, "step": 3140, "time_per_iteration": 2.6196014881134033 }, { "auxiliary_loss_clip": 0.01145621, "auxiliary_loss_mlp": 0.00762222, "balance_loss_clip": 1.05089688, "balance_loss_mlp": 1.00024819, "epoch": 0.3776829194973847, "flos": 19427888828160.0, "grad_norm": 2.092315608501026, "language_loss": 0.82962966, "learning_rate": 2.8605417194240114e-06, "loss": 0.84870809, "num_input_tokens_seen": 67500890, "step": 3141, "time_per_iteration": 2.6539828777313232 }, { "auxiliary_loss_clip": 0.01165856, "auxiliary_loss_mlp": 0.01023981, "balance_loss_clip": 1.0528605, "balance_loss_mlp": 1.01630998, "epoch": 0.3778031623880238, "flos": 17382309194880.0, "grad_norm": 1.743738776926079, "language_loss": 0.78767085, "learning_rate": 2.8598384765946315e-06, "loss": 0.80956924, "num_input_tokens_seen": 67519545, "step": 3142, "time_per_iteration": 2.6098129749298096 }, { "auxiliary_loss_clip": 0.01181392, "auxiliary_loss_mlp": 0.01022926, "balance_loss_clip": 1.05155158, "balance_loss_mlp": 1.01551092, "epoch": 0.3779234052786629, "flos": 27125843437440.0, "grad_norm": 1.7569908574497826, "language_loss": 0.7130307, "learning_rate": 2.8591351033284377e-06, "loss": 0.7350738, "num_input_tokens_seen": 67539275, "step": 3143, "time_per_iteration": 2.5902416706085205 }, { "auxiliary_loss_clip": 0.01170619, "auxiliary_loss_mlp": 0.01024386, "balance_loss_clip": 1.05182374, "balance_loss_mlp": 1.01644707, "epoch": 0.37804364816930197, "flos": 19682639061120.0, "grad_norm": 1.990746806410882, "language_loss": 0.83823824, "learning_rate": 2.8584315997321325e-06, "loss": 0.86018836, "num_input_tokens_seen": 67558280, "step": 3144, "time_per_iteration": 2.5905869007110596 }, { "auxiliary_loss_clip": 0.01185299, "auxiliary_loss_mlp": 0.0103195, "balance_loss_clip": 1.055444, "balance_loss_mlp": 1.02391505, "epoch": 0.3781638910599411, "flos": 22702905221760.0, "grad_norm": 2.2653390669488354, "language_loss": 0.7767424, "learning_rate": 2.8577279659124356e-06, "loss": 0.79891491, "num_input_tokens_seen": 67575955, "step": 3145, "time_per_iteration": 2.5280532836914062 }, { "auxiliary_loss_clip": 0.01167693, "auxiliary_loss_mlp": 0.01027801, "balance_loss_clip": 1.05249465, "balance_loss_mlp": 1.02066064, "epoch": 0.3782841339505802, "flos": 14647604158080.0, "grad_norm": 1.7749965364269122, "language_loss": 0.83329207, "learning_rate": 2.857024201976089e-06, "loss": 0.85524702, "num_input_tokens_seen": 67593515, "step": 3146, "time_per_iteration": 2.5365240573883057 }, { "auxiliary_loss_clip": 0.01151806, "auxiliary_loss_mlp": 0.01026084, "balance_loss_clip": 1.05099189, "balance_loss_mlp": 1.01743531, "epoch": 0.37840437684121925, "flos": 32818223185920.0, "grad_norm": 1.889390990166829, "language_loss": 0.72530037, "learning_rate": 2.8563203080298516e-06, "loss": 0.74707925, "num_input_tokens_seen": 67614290, "step": 3147, "time_per_iteration": 2.6815083026885986 }, { "auxiliary_loss_clip": 0.01156595, "auxiliary_loss_mlp": 0.00761299, "balance_loss_clip": 1.05404425, "balance_loss_mlp": 1.0002352, "epoch": 0.37852461973185836, "flos": 18369206346240.0, "grad_norm": 2.0786215497717646, "language_loss": 0.89359266, "learning_rate": 2.855616284180505e-06, "loss": 0.91277152, "num_input_tokens_seen": 67631340, "step": 3148, "time_per_iteration": 2.553753137588501 }, { "auxiliary_loss_clip": 0.01068229, "auxiliary_loss_mlp": 0.01003789, "balance_loss_clip": 1.01840603, "balance_loss_mlp": 1.0024178, "epoch": 0.37864486262249747, "flos": 59500680117120.0, "grad_norm": 0.8774801176984057, "language_loss": 0.66142684, "learning_rate": 2.8549121305348477e-06, "loss": 0.68214703, "num_input_tokens_seen": 67691125, "step": 3149, "time_per_iteration": 3.1097583770751953 }, { "auxiliary_loss_clip": 0.0117071, "auxiliary_loss_mlp": 0.01025978, "balance_loss_clip": 1.05391324, "balance_loss_mlp": 1.01875949, "epoch": 0.3787651055131365, "flos": 23363015414400.0, "grad_norm": 2.524443136308612, "language_loss": 0.83408594, "learning_rate": 2.8542078471997006e-06, "loss": 0.85605282, "num_input_tokens_seen": 67708740, "step": 3150, "time_per_iteration": 2.557471752166748 }, { "auxiliary_loss_clip": 0.01172112, "auxiliary_loss_mlp": 0.01026567, "balance_loss_clip": 1.05404723, "balance_loss_mlp": 1.01880097, "epoch": 0.37888534840377563, "flos": 24601394661120.0, "grad_norm": 1.7804519151607685, "language_loss": 0.75692034, "learning_rate": 2.8535034342819013e-06, "loss": 0.77890718, "num_input_tokens_seen": 67726150, "step": 3151, "time_per_iteration": 2.5591392517089844 }, { "auxiliary_loss_clip": 0.01182957, "auxiliary_loss_mlp": 0.0102993, "balance_loss_clip": 1.05559611, "balance_loss_mlp": 1.02227664, "epoch": 0.37900559129441475, "flos": 23986891762560.0, "grad_norm": 1.7389360927010975, "language_loss": 0.72433835, "learning_rate": 2.85279889188831e-06, "loss": 0.74646717, "num_input_tokens_seen": 67746525, "step": 3152, "time_per_iteration": 2.5101706981658936 }, { "auxiliary_loss_clip": 0.01136632, "auxiliary_loss_mlp": 0.01029365, "balance_loss_clip": 1.04517746, "balance_loss_mlp": 1.02178943, "epoch": 0.3791258341850538, "flos": 24644667571200.0, "grad_norm": 1.8769795387577275, "language_loss": 0.81059456, "learning_rate": 2.852094220125805e-06, "loss": 0.83225453, "num_input_tokens_seen": 67766035, "step": 3153, "time_per_iteration": 2.642052173614502 }, { "auxiliary_loss_clip": 0.01171219, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 1.05546272, "balance_loss_mlp": 1.02172875, "epoch": 0.3792460770756929, "flos": 17420841509760.0, "grad_norm": 2.05260689234353, "language_loss": 0.71095759, "learning_rate": 2.8513894191012846e-06, "loss": 0.73296577, "num_input_tokens_seen": 67785015, "step": 3154, "time_per_iteration": 2.537792682647705 }, { "auxiliary_loss_clip": 0.01184139, "auxiliary_loss_mlp": 0.01032792, "balance_loss_clip": 1.05467153, "balance_loss_mlp": 1.02552056, "epoch": 0.37936631996633197, "flos": 24206557386240.0, "grad_norm": 1.562213547718513, "language_loss": 0.78750598, "learning_rate": 2.8506844889216664e-06, "loss": 0.80967534, "num_input_tokens_seen": 67804400, "step": 3155, "time_per_iteration": 2.525606155395508 }, { "auxiliary_loss_clip": 0.01077716, "auxiliary_loss_mlp": 0.01009999, "balance_loss_clip": 1.03342938, "balance_loss_mlp": 1.00852704, "epoch": 0.3794865628569711, "flos": 70297114752000.0, "grad_norm": 0.8701397599867261, "language_loss": 0.62858891, "learning_rate": 2.849979429693887e-06, "loss": 0.64946616, "num_input_tokens_seen": 67865385, "step": 3156, "time_per_iteration": 3.936729907989502 }, { "auxiliary_loss_clip": 0.01182027, "auxiliary_loss_mlp": 0.01031431, "balance_loss_clip": 1.05470777, "balance_loss_mlp": 1.02356887, "epoch": 0.3796068057476102, "flos": 15779364860160.0, "grad_norm": 2.1041560094794707, "language_loss": 0.74110687, "learning_rate": 2.8492742415249042e-06, "loss": 0.76324141, "num_input_tokens_seen": 67883030, "step": 3157, "time_per_iteration": 2.471731662750244 }, { "auxiliary_loss_clip": 0.01180051, "auxiliary_loss_mlp": 0.01028495, "balance_loss_clip": 1.05170882, "balance_loss_mlp": 1.02069926, "epoch": 0.37972704863824924, "flos": 25191694771200.0, "grad_norm": 1.557968926099854, "language_loss": 0.76303136, "learning_rate": 2.848568924521694e-06, "loss": 0.78511685, "num_input_tokens_seen": 67903810, "step": 3158, "time_per_iteration": 2.542872905731201 }, { "auxiliary_loss_clip": 0.01161862, "auxiliary_loss_mlp": 0.01035443, "balance_loss_clip": 1.04961884, "balance_loss_mlp": 1.0278914, "epoch": 0.37984729152888835, "flos": 26210372480640.0, "grad_norm": 2.2822294027474612, "language_loss": 0.73342919, "learning_rate": 2.8478634787912526e-06, "loss": 0.75540233, "num_input_tokens_seen": 67921865, "step": 3159, "time_per_iteration": 2.5565409660339355 }, { "auxiliary_loss_clip": 0.01165177, "auxiliary_loss_mlp": 0.01031415, "balance_loss_clip": 1.0499506, "balance_loss_mlp": 1.02365148, "epoch": 0.37996753441952746, "flos": 25629302165760.0, "grad_norm": 3.5289695506326577, "language_loss": 0.77036244, "learning_rate": 2.847157904440596e-06, "loss": 0.79232836, "num_input_tokens_seen": 67941595, "step": 3160, "time_per_iteration": 2.5633418560028076 }, { "auxiliary_loss_clip": 0.0116577, "auxiliary_loss_mlp": 0.01028591, "balance_loss_clip": 1.05259085, "balance_loss_mlp": 1.02106929, "epoch": 0.3800877773101665, "flos": 20118414862080.0, "grad_norm": 1.5344910303159796, "language_loss": 0.73976779, "learning_rate": 2.846452201576759e-06, "loss": 0.76171136, "num_input_tokens_seen": 67960970, "step": 3161, "time_per_iteration": 3.34928297996521 }, { "auxiliary_loss_clip": 0.01057928, "auxiliary_loss_mlp": 0.01007916, "balance_loss_clip": 1.01784778, "balance_loss_mlp": 1.00636625, "epoch": 0.38020802020080563, "flos": 63053608037760.0, "grad_norm": 0.8410393461646315, "language_loss": 0.62780994, "learning_rate": 2.845746370306795e-06, "loss": 0.64846843, "num_input_tokens_seen": 68026160, "step": 3162, "time_per_iteration": 4.092927932739258 }, { "auxiliary_loss_clip": 0.01167163, "auxiliary_loss_mlp": 0.01028534, "balance_loss_clip": 1.05101895, "balance_loss_mlp": 1.0214355, "epoch": 0.38032826309144474, "flos": 21288420570240.0, "grad_norm": 2.008761971764891, "language_loss": 0.78544438, "learning_rate": 2.84504041073778e-06, "loss": 0.8074013, "num_input_tokens_seen": 68044575, "step": 3163, "time_per_iteration": 2.5568647384643555 }, { "auxiliary_loss_clip": 0.01148688, "auxiliary_loss_mlp": 0.01034712, "balance_loss_clip": 1.05163682, "balance_loss_mlp": 1.02665091, "epoch": 0.3804485059820838, "flos": 18954119416320.0, "grad_norm": 1.692562232449905, "language_loss": 0.79089928, "learning_rate": 2.844334322976806e-06, "loss": 0.81273335, "num_input_tokens_seen": 68064790, "step": 3164, "time_per_iteration": 2.578557014465332 }, { "auxiliary_loss_clip": 0.0112749, "auxiliary_loss_mlp": 0.01028563, "balance_loss_clip": 1.04561555, "balance_loss_mlp": 1.01998568, "epoch": 0.3805687488727229, "flos": 21833759831040.0, "grad_norm": 1.9973479501681453, "language_loss": 0.83172077, "learning_rate": 2.8436281071309866e-06, "loss": 0.85328132, "num_input_tokens_seen": 68083330, "step": 3165, "time_per_iteration": 2.6526529788970947 }, { "auxiliary_loss_clip": 0.01035761, "auxiliary_loss_mlp": 0.01011579, "balance_loss_clip": 1.01473498, "balance_loss_mlp": 1.01007128, "epoch": 0.380688991763362, "flos": 58546209968640.0, "grad_norm": 0.7328859531460182, "language_loss": 0.52987236, "learning_rate": 2.842921763307455e-06, "loss": 0.55034578, "num_input_tokens_seen": 68146140, "step": 3166, "time_per_iteration": 3.266592502593994 }, { "auxiliary_loss_clip": 0.01145744, "auxiliary_loss_mlp": 0.01027574, "balance_loss_clip": 1.04730248, "balance_loss_mlp": 1.01999271, "epoch": 0.38080923465400107, "flos": 23799509487360.0, "grad_norm": 2.3996404307369423, "language_loss": 0.82391667, "learning_rate": 2.842215291613361e-06, "loss": 0.84564984, "num_input_tokens_seen": 68164520, "step": 3167, "time_per_iteration": 2.7710354328155518 }, { "auxiliary_loss_clip": 0.01001178, "auxiliary_loss_mlp": 0.01010946, "balance_loss_clip": 1.0139327, "balance_loss_mlp": 1.00921714, "epoch": 0.3809294775446402, "flos": 54969866380800.0, "grad_norm": 0.8232133339993828, "language_loss": 0.59262639, "learning_rate": 2.8415086921558774e-06, "loss": 0.61274767, "num_input_tokens_seen": 68227945, "step": 3168, "time_per_iteration": 3.3661868572235107 }, { "auxiliary_loss_clip": 0.01138537, "auxiliary_loss_mlp": 0.01024192, "balance_loss_clip": 1.04443073, "balance_loss_mlp": 1.01689017, "epoch": 0.38104972043527924, "flos": 24643697904000.0, "grad_norm": 1.5857258567404975, "language_loss": 0.78783059, "learning_rate": 2.840801965042194e-06, "loss": 0.8094579, "num_input_tokens_seen": 68247405, "step": 3169, "time_per_iteration": 2.9885551929473877 }, { "auxiliary_loss_clip": 0.01143365, "auxiliary_loss_mlp": 0.01026002, "balance_loss_clip": 1.04579723, "balance_loss_mlp": 1.01838446, "epoch": 0.38116996332591835, "flos": 22856783086080.0, "grad_norm": 1.7506435320198142, "language_loss": 0.84105092, "learning_rate": 2.840095110379521e-06, "loss": 0.86274451, "num_input_tokens_seen": 68266925, "step": 3170, "time_per_iteration": 2.6298749446868896 }, { "auxiliary_loss_clip": 0.01028779, "auxiliary_loss_mlp": 0.0100342, "balance_loss_clip": 1.01491737, "balance_loss_mlp": 1.0021143, "epoch": 0.38129020621655746, "flos": 60836160804480.0, "grad_norm": 0.726946192976291, "language_loss": 0.53961122, "learning_rate": 2.8393881282750884e-06, "loss": 0.55993319, "num_input_tokens_seen": 68329755, "step": 3171, "time_per_iteration": 3.1572351455688477 }, { "auxiliary_loss_clip": 0.01150418, "auxiliary_loss_mlp": 0.01032318, "balance_loss_clip": 1.05018103, "balance_loss_mlp": 1.02405095, "epoch": 0.3814104491071965, "flos": 21648101408640.0, "grad_norm": 1.8497110440144855, "language_loss": 0.79083824, "learning_rate": 2.838681018836144e-06, "loss": 0.81266558, "num_input_tokens_seen": 68347075, "step": 3172, "time_per_iteration": 2.5744926929473877 }, { "auxiliary_loss_clip": 0.01140184, "auxiliary_loss_mlp": 0.00760483, "balance_loss_clip": 1.04757977, "balance_loss_mlp": 1.00022674, "epoch": 0.3815306919978356, "flos": 19099090707840.0, "grad_norm": 4.5010796943094435, "language_loss": 0.78067625, "learning_rate": 2.837973782169955e-06, "loss": 0.79968286, "num_input_tokens_seen": 68365450, "step": 3173, "time_per_iteration": 2.5888988971710205 }, { "auxiliary_loss_clip": 0.01073525, "auxiliary_loss_mlp": 0.01001866, "balance_loss_clip": 1.01537478, "balance_loss_mlp": 1.00045967, "epoch": 0.38165093488847474, "flos": 67067918156160.0, "grad_norm": 0.8058867931929685, "language_loss": 0.59184951, "learning_rate": 2.8372664183838096e-06, "loss": 0.61260343, "num_input_tokens_seen": 68428470, "step": 3174, "time_per_iteration": 3.1438207626342773 }, { "auxiliary_loss_clip": 0.0118087, "auxiliary_loss_mlp": 0.01030365, "balance_loss_clip": 1.05410421, "balance_loss_mlp": 1.0227474, "epoch": 0.3817711777791138, "flos": 22341105480960.0, "grad_norm": 2.2981575972290815, "language_loss": 0.68607616, "learning_rate": 2.836558927585015e-06, "loss": 0.70818853, "num_input_tokens_seen": 68445440, "step": 3175, "time_per_iteration": 2.5110583305358887 }, { "auxiliary_loss_clip": 0.01169841, "auxiliary_loss_mlp": 0.01032639, "balance_loss_clip": 1.05263937, "balance_loss_mlp": 1.02492046, "epoch": 0.3818914206697529, "flos": 22820621068800.0, "grad_norm": 1.741719237994178, "language_loss": 0.82579315, "learning_rate": 2.8358513098808957e-06, "loss": 0.84781796, "num_input_tokens_seen": 68465755, "step": 3176, "time_per_iteration": 2.563683271408081 }, { "auxiliary_loss_clip": 0.01114822, "auxiliary_loss_mlp": 0.01029367, "balance_loss_clip": 1.04664588, "balance_loss_mlp": 1.02146983, "epoch": 0.382011663560392, "flos": 24386074583040.0, "grad_norm": 1.7075545654893276, "language_loss": 0.76702464, "learning_rate": 2.835143565378798e-06, "loss": 0.78846645, "num_input_tokens_seen": 68486220, "step": 3177, "time_per_iteration": 2.6704649925231934 }, { "auxiliary_loss_clip": 0.0110571, "auxiliary_loss_mlp": 0.01025597, "balance_loss_clip": 1.04483175, "balance_loss_mlp": 1.01767302, "epoch": 0.38213190645103107, "flos": 21981568296960.0, "grad_norm": 1.770877751482208, "language_loss": 0.78048712, "learning_rate": 2.8344356941860847e-06, "loss": 0.80180019, "num_input_tokens_seen": 68505850, "step": 3178, "time_per_iteration": 2.68534779548645 }, { "auxiliary_loss_clip": 0.01137199, "auxiliary_loss_mlp": 0.01033743, "balance_loss_clip": 1.05034316, "balance_loss_mlp": 1.0259943, "epoch": 0.3822521493416702, "flos": 35516945773440.0, "grad_norm": 2.018592747069429, "language_loss": 0.65893912, "learning_rate": 2.8337276964101403e-06, "loss": 0.68064857, "num_input_tokens_seen": 68526290, "step": 3179, "time_per_iteration": 2.734175682067871 }, { "auxiliary_loss_clip": 0.01171084, "auxiliary_loss_mlp": 0.0103468, "balance_loss_clip": 1.05352879, "balance_loss_mlp": 1.02695203, "epoch": 0.3823723922323093, "flos": 21069904181760.0, "grad_norm": 1.961656341265008, "language_loss": 0.76465887, "learning_rate": 2.833019572158367e-06, "loss": 0.78671646, "num_input_tokens_seen": 68544725, "step": 3180, "time_per_iteration": 2.5567846298217773 }, { "auxiliary_loss_clip": 0.01153843, "auxiliary_loss_mlp": 0.01028496, "balance_loss_clip": 1.05212271, "balance_loss_mlp": 1.02107823, "epoch": 0.38249263512294834, "flos": 19789149864960.0, "grad_norm": 1.6560811282426453, "language_loss": 0.80065393, "learning_rate": 2.8323113215381872e-06, "loss": 0.82247734, "num_input_tokens_seen": 68563070, "step": 3181, "time_per_iteration": 3.447103261947632 }, { "auxiliary_loss_clip": 0.01139709, "auxiliary_loss_mlp": 0.01033404, "balance_loss_clip": 1.04897213, "balance_loss_mlp": 1.02495182, "epoch": 0.38261287801358745, "flos": 21433930565760.0, "grad_norm": 2.2063295582998776, "language_loss": 0.76183581, "learning_rate": 2.831602944657042e-06, "loss": 0.78356689, "num_input_tokens_seen": 68581150, "step": 3182, "time_per_iteration": 2.614678382873535 }, { "auxiliary_loss_clip": 0.01158063, "auxiliary_loss_mlp": 0.01031475, "balance_loss_clip": 1.05024314, "balance_loss_mlp": 1.02379775, "epoch": 0.38273312090422656, "flos": 21981568296960.0, "grad_norm": 4.2601465907195495, "language_loss": 0.74315912, "learning_rate": 2.830894441622391e-06, "loss": 0.76505446, "num_input_tokens_seen": 68597800, "step": 3183, "time_per_iteration": 2.627392053604126 }, { "auxiliary_loss_clip": 0.01134288, "auxiliary_loss_mlp": 0.00761488, "balance_loss_clip": 1.04383349, "balance_loss_mlp": 1.00021768, "epoch": 0.3828533637948656, "flos": 24790895838720.0, "grad_norm": 1.8478201634458247, "language_loss": 0.79997826, "learning_rate": 2.8301858125417134e-06, "loss": 0.81893599, "num_input_tokens_seen": 68617640, "step": 3184, "time_per_iteration": 2.6682589054107666 }, { "auxiliary_loss_clip": 0.01153812, "auxiliary_loss_mlp": 0.01025583, "balance_loss_clip": 1.05209708, "balance_loss_mlp": 1.01850498, "epoch": 0.38297360668550473, "flos": 22455445449600.0, "grad_norm": 1.7050549723848305, "language_loss": 0.74028885, "learning_rate": 2.8294770575225082e-06, "loss": 0.76208282, "num_input_tokens_seen": 68637770, "step": 3185, "time_per_iteration": 2.5887250900268555 }, { "auxiliary_loss_clip": 0.01169411, "auxiliary_loss_mlp": 0.01038444, "balance_loss_clip": 1.05421352, "balance_loss_mlp": 1.03052258, "epoch": 0.3830938495761438, "flos": 24896903852160.0, "grad_norm": 1.7173340416909162, "language_loss": 0.84119129, "learning_rate": 2.828768176672293e-06, "loss": 0.86326981, "num_input_tokens_seen": 68656885, "step": 3186, "time_per_iteration": 3.3614981174468994 }, { "auxiliary_loss_clip": 0.01136011, "auxiliary_loss_mlp": 0.01040471, "balance_loss_clip": 1.04660833, "balance_loss_mlp": 1.0320425, "epoch": 0.3832140924667829, "flos": 33036236784000.0, "grad_norm": 1.720422409927139, "language_loss": 0.71573353, "learning_rate": 2.8280591700986044e-06, "loss": 0.73749828, "num_input_tokens_seen": 68678750, "step": 3187, "time_per_iteration": 4.3570005893707275 }, { "auxiliary_loss_clip": 0.01156623, "auxiliary_loss_mlp": 0.01030804, "balance_loss_clip": 1.04885459, "balance_loss_mlp": 1.02306104, "epoch": 0.383334335357422, "flos": 31903721896320.0, "grad_norm": 6.630667151384367, "language_loss": 0.74690795, "learning_rate": 2.827350037908999e-06, "loss": 0.7687822, "num_input_tokens_seen": 68698190, "step": 3188, "time_per_iteration": 2.69952654838562 }, { "auxiliary_loss_clip": 0.01146119, "auxiliary_loss_mlp": 0.01029759, "balance_loss_clip": 1.04970288, "balance_loss_mlp": 1.02148008, "epoch": 0.38345457824806106, "flos": 19791915212160.0, "grad_norm": 2.051121564887161, "language_loss": 0.79186571, "learning_rate": 2.8266407802110496e-06, "loss": 0.81362444, "num_input_tokens_seen": 68716445, "step": 3189, "time_per_iteration": 2.5902163982391357 }, { "auxiliary_loss_clip": 0.01102982, "auxiliary_loss_mlp": 0.0103912, "balance_loss_clip": 1.0441494, "balance_loss_mlp": 1.0311625, "epoch": 0.3835748211387002, "flos": 22419391173120.0, "grad_norm": 2.259900841120351, "language_loss": 0.7592839, "learning_rate": 2.8259313971123515e-06, "loss": 0.78070486, "num_input_tokens_seen": 68737565, "step": 3190, "time_per_iteration": 2.743882894515991 }, { "auxiliary_loss_clip": 0.01169456, "auxiliary_loss_mlp": 0.0102449, "balance_loss_clip": 1.05671239, "balance_loss_mlp": 1.01698017, "epoch": 0.3836950640293393, "flos": 25118436983040.0, "grad_norm": 1.5002690176233273, "language_loss": 0.78207505, "learning_rate": 2.8252218887205166e-06, "loss": 0.8040145, "num_input_tokens_seen": 68758255, "step": 3191, "time_per_iteration": 2.5763626098632812 }, { "auxiliary_loss_clip": 0.01116226, "auxiliary_loss_mlp": 0.01033513, "balance_loss_clip": 1.04848003, "balance_loss_mlp": 1.02579975, "epoch": 0.38381530691997834, "flos": 21799213925760.0, "grad_norm": 1.7150154740770813, "language_loss": 0.80969393, "learning_rate": 2.824512255143178e-06, "loss": 0.8311913, "num_input_tokens_seen": 68777490, "step": 3192, "time_per_iteration": 2.7162322998046875 }, { "auxiliary_loss_clip": 0.01142962, "auxiliary_loss_mlp": 0.01026173, "balance_loss_clip": 1.04978836, "balance_loss_mlp": 1.0182333, "epoch": 0.38393554981061745, "flos": 21252689516160.0, "grad_norm": 1.6540776847310479, "language_loss": 0.79402709, "learning_rate": 2.8238024964879855e-06, "loss": 0.81571841, "num_input_tokens_seen": 68798385, "step": 3193, "time_per_iteration": 2.601266384124756 }, { "auxiliary_loss_clip": 0.01185461, "auxiliary_loss_mlp": 0.01028307, "balance_loss_clip": 1.05555105, "balance_loss_mlp": 1.02059126, "epoch": 0.38405579270125656, "flos": 17019360218880.0, "grad_norm": 2.616518730027013, "language_loss": 0.76894891, "learning_rate": 2.8230926128626095e-06, "loss": 0.79108661, "num_input_tokens_seen": 68816880, "step": 3194, "time_per_iteration": 2.513286828994751 }, { "auxiliary_loss_clip": 0.01147183, "auxiliary_loss_mlp": 0.01028218, "balance_loss_clip": 1.04752803, "balance_loss_mlp": 1.02023101, "epoch": 0.3841760355918956, "flos": 21835375943040.0, "grad_norm": 2.0993437529251673, "language_loss": 0.79327828, "learning_rate": 2.822382604374738e-06, "loss": 0.8150323, "num_input_tokens_seen": 68835805, "step": 3195, "time_per_iteration": 2.575206756591797 }, { "auxiliary_loss_clip": 0.01155034, "auxiliary_loss_mlp": 0.01030909, "balance_loss_clip": 1.05496407, "balance_loss_mlp": 1.02264762, "epoch": 0.3842962784825347, "flos": 25915114684800.0, "grad_norm": 2.3246083327926605, "language_loss": 0.6538856, "learning_rate": 2.8216724711320793e-06, "loss": 0.67574501, "num_input_tokens_seen": 68854930, "step": 3196, "time_per_iteration": 2.6148626804351807 }, { "auxiliary_loss_clip": 0.01180301, "auxiliary_loss_mlp": 0.00760917, "balance_loss_clip": 1.05277216, "balance_loss_mlp": 1.00021362, "epoch": 0.38441652137317384, "flos": 25337492075520.0, "grad_norm": 3.723461568977277, "language_loss": 0.79567111, "learning_rate": 2.820962213242361e-06, "loss": 0.81508327, "num_input_tokens_seen": 68874260, "step": 3197, "time_per_iteration": 2.5601460933685303 }, { "auxiliary_loss_clip": 0.01164267, "auxiliary_loss_mlp": 0.0103345, "balance_loss_clip": 1.05300593, "balance_loss_mlp": 1.02567172, "epoch": 0.3845367642638129, "flos": 18113486446080.0, "grad_norm": 2.346647241130003, "language_loss": 0.83936942, "learning_rate": 2.8202518308133264e-06, "loss": 0.86134648, "num_input_tokens_seen": 68891535, "step": 3198, "time_per_iteration": 2.6107635498046875 }, { "auxiliary_loss_clip": 0.01185154, "auxiliary_loss_mlp": 0.01030303, "balance_loss_clip": 1.05418682, "balance_loss_mlp": 1.02210116, "epoch": 0.384657007154452, "flos": 25228395492480.0, "grad_norm": 1.8170567036206642, "language_loss": 0.73287165, "learning_rate": 2.8195413239527426e-06, "loss": 0.75502616, "num_input_tokens_seen": 68911275, "step": 3199, "time_per_iteration": 2.535554885864258 }, { "auxiliary_loss_clip": 0.0116592, "auxiliary_loss_mlp": 0.0102866, "balance_loss_clip": 1.05254185, "balance_loss_mlp": 1.02111447, "epoch": 0.38477725004509106, "flos": 19865855358720.0, "grad_norm": 1.8280198223192647, "language_loss": 0.80225551, "learning_rate": 2.8188306927683906e-06, "loss": 0.82420135, "num_input_tokens_seen": 68930745, "step": 3200, "time_per_iteration": 2.5393307209014893 }, { "auxiliary_loss_clip": 0.01155892, "auxiliary_loss_mlp": 0.01030924, "balance_loss_clip": 1.05300486, "balance_loss_mlp": 1.02335417, "epoch": 0.38489749293573017, "flos": 18259391491200.0, "grad_norm": 1.8564501128206192, "language_loss": 0.7456764, "learning_rate": 2.818119937368074e-06, "loss": 0.76754463, "num_input_tokens_seen": 68949380, "step": 3201, "time_per_iteration": 2.580806255340576 }, { "auxiliary_loss_clip": 0.01173078, "auxiliary_loss_mlp": 0.01032357, "balance_loss_clip": 1.05266225, "balance_loss_mlp": 1.02407229, "epoch": 0.3850177358263693, "flos": 24389163152640.0, "grad_norm": 1.897645435154649, "language_loss": 0.65535355, "learning_rate": 2.817409057859613e-06, "loss": 0.67740786, "num_input_tokens_seen": 68968370, "step": 3202, "time_per_iteration": 2.560251474380493 }, { "auxiliary_loss_clip": 0.01122366, "auxiliary_loss_mlp": 0.01034227, "balance_loss_clip": 1.04767954, "balance_loss_mlp": 1.02495873, "epoch": 0.38513797871700833, "flos": 17671533505920.0, "grad_norm": 2.0679698832971645, "language_loss": 0.79169786, "learning_rate": 2.8166980543508482e-06, "loss": 0.81326377, "num_input_tokens_seen": 68984260, "step": 3203, "time_per_iteration": 2.6013872623443604 }, { "auxiliary_loss_clip": 0.01189083, "auxiliary_loss_mlp": 0.01033587, "balance_loss_clip": 1.05870807, "balance_loss_mlp": 1.02535534, "epoch": 0.38525822160764744, "flos": 25739583897600.0, "grad_norm": 1.70447955417919, "language_loss": 0.79579139, "learning_rate": 2.815986926949638e-06, "loss": 0.81801808, "num_input_tokens_seen": 69002760, "step": 3204, "time_per_iteration": 2.5315709114074707 }, { "auxiliary_loss_clip": 0.01168676, "auxiliary_loss_mlp": 0.01033028, "balance_loss_clip": 1.05433273, "balance_loss_mlp": 1.02514839, "epoch": 0.38537846449828655, "flos": 20193647898240.0, "grad_norm": 1.7709256279851064, "language_loss": 0.8012827, "learning_rate": 2.8152756757638597e-06, "loss": 0.82329977, "num_input_tokens_seen": 69021260, "step": 3205, "time_per_iteration": 2.5194461345672607 }, { "auxiliary_loss_clip": 0.01166757, "auxiliary_loss_mlp": 0.01030373, "balance_loss_clip": 1.05370855, "balance_loss_mlp": 1.02237988, "epoch": 0.3854987073889256, "flos": 23039352938880.0, "grad_norm": 1.9795042599300814, "language_loss": 0.84469151, "learning_rate": 2.8145643009014093e-06, "loss": 0.86666274, "num_input_tokens_seen": 69039755, "step": 3206, "time_per_iteration": 2.5414235591888428 }, { "auxiliary_loss_clip": 0.01171327, "auxiliary_loss_mlp": 0.01031845, "balance_loss_clip": 1.05381763, "balance_loss_mlp": 1.02462697, "epoch": 0.3856189502795647, "flos": 20190631155840.0, "grad_norm": 2.0862367526245915, "language_loss": 0.79215789, "learning_rate": 2.813852802470202e-06, "loss": 0.81418955, "num_input_tokens_seen": 69057650, "step": 3207, "time_per_iteration": 2.52268123626709 }, { "auxiliary_loss_clip": 0.01151802, "auxiliary_loss_mlp": 0.01029077, "balance_loss_clip": 1.05160427, "balance_loss_mlp": 1.02072012, "epoch": 0.38573919317020383, "flos": 25702631781120.0, "grad_norm": 1.6747224770050817, "language_loss": 0.72519374, "learning_rate": 2.8131411805781717e-06, "loss": 0.7470026, "num_input_tokens_seen": 69077775, "step": 3208, "time_per_iteration": 3.405616283416748 }, { "auxiliary_loss_clip": 0.01161074, "auxiliary_loss_mlp": 0.01032413, "balance_loss_clip": 1.05637848, "balance_loss_mlp": 1.02386558, "epoch": 0.3858594360608429, "flos": 29821405628160.0, "grad_norm": 2.134008584707854, "language_loss": 0.64308453, "learning_rate": 2.8124294353332707e-06, "loss": 0.66501939, "num_input_tokens_seen": 69096450, "step": 3209, "time_per_iteration": 2.6237499713897705 }, { "auxiliary_loss_clip": 0.01145433, "auxiliary_loss_mlp": 0.01028123, "balance_loss_clip": 1.05010819, "balance_loss_mlp": 1.02013612, "epoch": 0.385979678951482, "flos": 24790428961920.0, "grad_norm": 1.571109918653736, "language_loss": 0.77315277, "learning_rate": 2.8117175668434713e-06, "loss": 0.79488832, "num_input_tokens_seen": 69116110, "step": 3210, "time_per_iteration": 2.6237189769744873 }, { "auxiliary_loss_clip": 0.01185455, "auxiliary_loss_mlp": 0.01030286, "balance_loss_clip": 1.05648303, "balance_loss_mlp": 1.02172065, "epoch": 0.3860999218421211, "flos": 21287881866240.0, "grad_norm": 2.377936303948983, "language_loss": 0.6999588, "learning_rate": 2.811005575216762e-06, "loss": 0.72211623, "num_input_tokens_seen": 69134825, "step": 3211, "time_per_iteration": 2.5008528232574463 }, { "auxiliary_loss_clip": 0.01134204, "auxiliary_loss_mlp": 0.01031, "balance_loss_clip": 1.04786479, "balance_loss_mlp": 1.02306092, "epoch": 0.38622016473276016, "flos": 24536720223360.0, "grad_norm": 1.3735740215581913, "language_loss": 0.78862917, "learning_rate": 2.8102934605611513e-06, "loss": 0.81028128, "num_input_tokens_seen": 69156460, "step": 3212, "time_per_iteration": 3.4102485179901123 }, { "auxiliary_loss_clip": 0.01164409, "auxiliary_loss_mlp": 0.01037551, "balance_loss_clip": 1.05579066, "balance_loss_mlp": 1.02927804, "epoch": 0.3863404076233993, "flos": 20558212986240.0, "grad_norm": 1.8724514536521324, "language_loss": 0.67309213, "learning_rate": 2.8095812229846665e-06, "loss": 0.69511175, "num_input_tokens_seen": 69176420, "step": 3213, "time_per_iteration": 3.3979415893554688 }, { "auxiliary_loss_clip": 0.0115829, "auxiliary_loss_mlp": 0.01029013, "balance_loss_clip": 1.0519774, "balance_loss_mlp": 1.02116883, "epoch": 0.3864606505140384, "flos": 22346277039360.0, "grad_norm": 2.2409379589455405, "language_loss": 0.68583167, "learning_rate": 2.808868862595355e-06, "loss": 0.70770466, "num_input_tokens_seen": 69196665, "step": 3214, "time_per_iteration": 2.5950615406036377 }, { "auxiliary_loss_clip": 0.01175955, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.056319, "balance_loss_mlp": 1.02845085, "epoch": 0.38658089340467744, "flos": 25703601448320.0, "grad_norm": 1.7329662004284174, "language_loss": 0.79644591, "learning_rate": 2.8081563795012795e-06, "loss": 0.81856817, "num_input_tokens_seen": 69216290, "step": 3215, "time_per_iteration": 2.5582728385925293 }, { "auxiliary_loss_clip": 0.01162647, "auxiliary_loss_mlp": 0.01032012, "balance_loss_clip": 1.05226576, "balance_loss_mlp": 1.02371526, "epoch": 0.38670113629531655, "flos": 33802534558080.0, "grad_norm": 1.8685675237690929, "language_loss": 0.73556638, "learning_rate": 2.807443773810524e-06, "loss": 0.75751305, "num_input_tokens_seen": 69237550, "step": 3216, "time_per_iteration": 2.69624924659729 }, { "auxiliary_loss_clip": 0.0114114, "auxiliary_loss_mlp": 0.01026062, "balance_loss_clip": 1.05107331, "balance_loss_mlp": 1.01806927, "epoch": 0.3868213791859556, "flos": 23331522165120.0, "grad_norm": 1.7447337428549612, "language_loss": 0.89402711, "learning_rate": 2.80673104563119e-06, "loss": 0.91569912, "num_input_tokens_seen": 69258175, "step": 3217, "time_per_iteration": 2.748058557510376 }, { "auxiliary_loss_clip": 0.011669, "auxiliary_loss_mlp": 0.01035821, "balance_loss_clip": 1.05385423, "balance_loss_mlp": 1.02830458, "epoch": 0.3869416220765947, "flos": 18441530380800.0, "grad_norm": 1.969753005943133, "language_loss": 0.79100776, "learning_rate": 2.8060181950713976e-06, "loss": 0.81303495, "num_input_tokens_seen": 69274965, "step": 3218, "time_per_iteration": 2.5027782917022705 }, { "auxiliary_loss_clip": 0.01137952, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.04760849, "balance_loss_mlp": 1.01924443, "epoch": 0.3870618649672338, "flos": 15632992938240.0, "grad_norm": 1.9444174122914126, "language_loss": 0.81034541, "learning_rate": 2.805305222239286e-06, "loss": 0.83200657, "num_input_tokens_seen": 69292220, "step": 3219, "time_per_iteration": 2.576223134994507 }, { "auxiliary_loss_clip": 0.01152114, "auxiliary_loss_mlp": 0.01031545, "balance_loss_clip": 1.05209756, "balance_loss_mlp": 1.02368331, "epoch": 0.3871821078578729, "flos": 23513804709120.0, "grad_norm": 1.7141292661656335, "language_loss": 0.7375254, "learning_rate": 2.8045921272430118e-06, "loss": 0.75936198, "num_input_tokens_seen": 69311900, "step": 3220, "time_per_iteration": 2.6178882122039795 }, { "auxiliary_loss_clip": 0.01175583, "auxiliary_loss_mlp": 0.01033122, "balance_loss_clip": 1.05281734, "balance_loss_mlp": 1.02506959, "epoch": 0.387302350748512, "flos": 17778259791360.0, "grad_norm": 2.0322488080583234, "language_loss": 0.76448017, "learning_rate": 2.803878910190753e-06, "loss": 0.78656727, "num_input_tokens_seen": 69328820, "step": 3221, "time_per_iteration": 2.496628761291504 }, { "auxiliary_loss_clip": 0.01174061, "auxiliary_loss_mlp": 0.01031264, "balance_loss_clip": 1.05425978, "balance_loss_mlp": 1.02347422, "epoch": 0.3874225936391511, "flos": 11503409097600.0, "grad_norm": 2.3220962156436378, "language_loss": 0.81860518, "learning_rate": 2.8031655711907017e-06, "loss": 0.84065843, "num_input_tokens_seen": 69342525, "step": 3222, "time_per_iteration": 2.489475965499878 }, { "auxiliary_loss_clip": 0.01175339, "auxiliary_loss_mlp": 0.01036923, "balance_loss_clip": 1.05675018, "balance_loss_mlp": 1.02871573, "epoch": 0.38754283652979016, "flos": 21945154884480.0, "grad_norm": 2.0879624834716597, "language_loss": 0.80654544, "learning_rate": 2.8024521103510723e-06, "loss": 0.82866806, "num_input_tokens_seen": 69359295, "step": 3223, "time_per_iteration": 2.527952194213867 }, { "auxiliary_loss_clip": 0.01167637, "auxiliary_loss_mlp": 0.01032091, "balance_loss_clip": 1.05054593, "balance_loss_mlp": 1.02381158, "epoch": 0.38766307942042927, "flos": 21175984022400.0, "grad_norm": 2.2051621238181087, "language_loss": 0.7503497, "learning_rate": 2.8017385277800952e-06, "loss": 0.77234697, "num_input_tokens_seen": 69377650, "step": 3224, "time_per_iteration": 2.535352945327759 }, { "auxiliary_loss_clip": 0.01147569, "auxiliary_loss_mlp": 0.01035857, "balance_loss_clip": 1.05290866, "balance_loss_mlp": 1.02766132, "epoch": 0.3877833223110684, "flos": 27417294391680.0, "grad_norm": 3.1782239351241555, "language_loss": 0.75343406, "learning_rate": 2.8010248235860213e-06, "loss": 0.77526832, "num_input_tokens_seen": 69397765, "step": 3225, "time_per_iteration": 2.6397595405578613 }, { "auxiliary_loss_clip": 0.01057263, "auxiliary_loss_mlp": 0.0075204, "balance_loss_clip": 1.01655769, "balance_loss_mlp": 1.00021076, "epoch": 0.38790356520170743, "flos": 64500019879680.0, "grad_norm": 0.8542399354502178, "language_loss": 0.62776875, "learning_rate": 2.8003109978771192e-06, "loss": 0.64586174, "num_input_tokens_seen": 69458930, "step": 3226, "time_per_iteration": 3.217353582382202 }, { "auxiliary_loss_clip": 0.01132065, "auxiliary_loss_mlp": 0.01029243, "balance_loss_clip": 1.04496861, "balance_loss_mlp": 1.02145267, "epoch": 0.38802380809234654, "flos": 22345415112960.0, "grad_norm": 1.8257176357712448, "language_loss": 0.78634548, "learning_rate": 2.799597050761674e-06, "loss": 0.8079586, "num_input_tokens_seen": 69475135, "step": 3227, "time_per_iteration": 2.6161904335021973 }, { "auxiliary_loss_clip": 0.01187093, "auxiliary_loss_mlp": 0.01027118, "balance_loss_clip": 1.05687141, "balance_loss_mlp": 1.01894665, "epoch": 0.38814405098298566, "flos": 25261361199360.0, "grad_norm": 1.8319235918043548, "language_loss": 0.79166847, "learning_rate": 2.7988829823479924e-06, "loss": 0.81381059, "num_input_tokens_seen": 69493525, "step": 3228, "time_per_iteration": 2.5282504558563232 }, { "auxiliary_loss_clip": 0.01147575, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.04751766, "balance_loss_mlp": 1.02592349, "epoch": 0.3882642938736247, "flos": 18841180078080.0, "grad_norm": 1.8634495262081499, "language_loss": 0.64134574, "learning_rate": 2.7981687927443976e-06, "loss": 0.66315854, "num_input_tokens_seen": 69510325, "step": 3229, "time_per_iteration": 2.542198657989502 }, { "auxiliary_loss_clip": 0.01169023, "auxiliary_loss_mlp": 0.01026763, "balance_loss_clip": 1.05080342, "balance_loss_mlp": 1.01938653, "epoch": 0.3883845367642638, "flos": 21652806090240.0, "grad_norm": 2.5990740733545272, "language_loss": 0.85583186, "learning_rate": 2.797454482059231e-06, "loss": 0.87778968, "num_input_tokens_seen": 69530480, "step": 3230, "time_per_iteration": 2.556319236755371 }, { "auxiliary_loss_clip": 0.01186695, "auxiliary_loss_mlp": 0.01028999, "balance_loss_clip": 1.05638611, "balance_loss_mlp": 1.02079785, "epoch": 0.3885047796549029, "flos": 20557530627840.0, "grad_norm": 1.6803844136771386, "language_loss": 0.84465158, "learning_rate": 2.7967400504008537e-06, "loss": 0.86680853, "num_input_tokens_seen": 69549780, "step": 3231, "time_per_iteration": 2.48679518699646 }, { "auxiliary_loss_clip": 0.01044955, "auxiliary_loss_mlp": 0.01016988, "balance_loss_clip": 1.02924347, "balance_loss_mlp": 1.01564682, "epoch": 0.388625022545542, "flos": 64325491695360.0, "grad_norm": 0.8243668827206219, "language_loss": 0.57504642, "learning_rate": 2.7960254978776456e-06, "loss": 0.59566581, "num_input_tokens_seen": 69611870, "step": 3232, "time_per_iteration": 3.2004919052124023 }, { "auxiliary_loss_clip": 0.01186616, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.05658269, "balance_loss_mlp": 1.02491832, "epoch": 0.3887452654361811, "flos": 18113881495680.0, "grad_norm": 2.049442158131418, "language_loss": 0.81632876, "learning_rate": 2.7953108245980006e-06, "loss": 0.8385275, "num_input_tokens_seen": 69630385, "step": 3233, "time_per_iteration": 2.480072021484375 }, { "auxiliary_loss_clip": 0.01149849, "auxiliary_loss_mlp": 0.01026738, "balance_loss_clip": 1.05305195, "balance_loss_mlp": 1.01921582, "epoch": 0.38886550832682015, "flos": 24975261371520.0, "grad_norm": 1.7231215691533082, "language_loss": 0.73714697, "learning_rate": 2.7945960306703365e-06, "loss": 0.7589128, "num_input_tokens_seen": 69653370, "step": 3234, "time_per_iteration": 3.423224449157715 }, { "auxiliary_loss_clip": 0.01172552, "auxiliary_loss_mlp": 0.01028604, "balance_loss_clip": 1.05309439, "balance_loss_mlp": 1.02129054, "epoch": 0.38898575121745926, "flos": 27199496275200.0, "grad_norm": 1.7315721883142428, "language_loss": 0.65436977, "learning_rate": 2.7938811162030865e-06, "loss": 0.67638135, "num_input_tokens_seen": 69673635, "step": 3235, "time_per_iteration": 2.5831713676452637 }, { "auxiliary_loss_clip": 0.01168905, "auxiliary_loss_mlp": 0.01022528, "balance_loss_clip": 1.05413294, "balance_loss_mlp": 1.01517868, "epoch": 0.3891059941080984, "flos": 28763728727040.0, "grad_norm": 1.7140281533287955, "language_loss": 0.82570899, "learning_rate": 2.793166081304702e-06, "loss": 0.84762335, "num_input_tokens_seen": 69694130, "step": 3236, "time_per_iteration": 2.601123094558716 }, { "auxiliary_loss_clip": 0.01144153, "auxiliary_loss_mlp": 0.01037988, "balance_loss_clip": 1.04788649, "balance_loss_mlp": 1.02953637, "epoch": 0.38922623699873743, "flos": 22893447893760.0, "grad_norm": 2.1550561064969553, "language_loss": 0.82250309, "learning_rate": 2.7924509260836543e-06, "loss": 0.84432447, "num_input_tokens_seen": 69713255, "step": 3237, "time_per_iteration": 2.608346700668335 }, { "auxiliary_loss_clip": 0.01142908, "auxiliary_loss_mlp": 0.01031108, "balance_loss_clip": 1.05112755, "balance_loss_mlp": 1.02360368, "epoch": 0.38934647988937654, "flos": 19792418002560.0, "grad_norm": 1.5261054476345841, "language_loss": 0.6843884, "learning_rate": 2.791735650648431e-06, "loss": 0.70612854, "num_input_tokens_seen": 69732375, "step": 3238, "time_per_iteration": 3.4508845806121826 }, { "auxiliary_loss_clip": 0.01152826, "auxiliary_loss_mlp": 0.01029852, "balance_loss_clip": 1.05173981, "balance_loss_mlp": 1.0221988, "epoch": 0.38946672278001565, "flos": 19202081978880.0, "grad_norm": 1.7812232190187807, "language_loss": 0.74393767, "learning_rate": 2.791020255107538e-06, "loss": 0.76576447, "num_input_tokens_seen": 69749745, "step": 3239, "time_per_iteration": 4.14414381980896 }, { "auxiliary_loss_clip": 0.01133573, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.04400384, "balance_loss_mlp": 1.02294874, "epoch": 0.3895869656706547, "flos": 24936477661440.0, "grad_norm": 1.54670484075314, "language_loss": 0.80433655, "learning_rate": 2.7903047395695023e-06, "loss": 0.82598138, "num_input_tokens_seen": 69769645, "step": 3240, "time_per_iteration": 2.6313257217407227 }, { "auxiliary_loss_clip": 0.01170655, "auxiliary_loss_mlp": 0.00761787, "balance_loss_clip": 1.05544007, "balance_loss_mlp": 1.00022674, "epoch": 0.3897072085612938, "flos": 24133622820480.0, "grad_norm": 3.3450481839136548, "language_loss": 0.89830005, "learning_rate": 2.789589104142865e-06, "loss": 0.91762447, "num_input_tokens_seen": 69787270, "step": 3241, "time_per_iteration": 2.565805673599243 }, { "auxiliary_loss_clip": 0.01143255, "auxiliary_loss_mlp": 0.01032956, "balance_loss_clip": 1.05040479, "balance_loss_mlp": 1.02498722, "epoch": 0.3898274514519329, "flos": 17166342672000.0, "grad_norm": 1.6787353610362246, "language_loss": 0.76628661, "learning_rate": 2.7888733489361895e-06, "loss": 0.78804874, "num_input_tokens_seen": 69805685, "step": 3242, "time_per_iteration": 2.610018014907837 }, { "auxiliary_loss_clip": 0.01071529, "auxiliary_loss_mlp": 0.01002085, "balance_loss_clip": 1.01368737, "balance_loss_mlp": 1.00066018, "epoch": 0.389947694342572, "flos": 66074807952000.0, "grad_norm": 0.7313670663724887, "language_loss": 0.58720273, "learning_rate": 2.788157474058054e-06, "loss": 0.60793889, "num_input_tokens_seen": 69867960, "step": 3243, "time_per_iteration": 3.1786117553710938 }, { "auxiliary_loss_clip": 0.01180056, "auxiliary_loss_mlp": 0.01037427, "balance_loss_clip": 1.05389822, "balance_loss_mlp": 1.02942824, "epoch": 0.3900679372332111, "flos": 25740912700800.0, "grad_norm": 1.5157742346654828, "language_loss": 0.698551, "learning_rate": 2.7874414796170555e-06, "loss": 0.72072577, "num_input_tokens_seen": 69889450, "step": 3244, "time_per_iteration": 2.546280860900879 }, { "auxiliary_loss_clip": 0.01165346, "auxiliary_loss_mlp": 0.01029829, "balance_loss_clip": 1.05168879, "balance_loss_mlp": 1.02123404, "epoch": 0.3901881801238502, "flos": 11801611808640.0, "grad_norm": 2.6295418538196493, "language_loss": 0.83954549, "learning_rate": 2.7867253657218113e-06, "loss": 0.86149728, "num_input_tokens_seen": 69903340, "step": 3245, "time_per_iteration": 2.4912102222442627 }, { "auxiliary_loss_clip": 0.0115172, "auxiliary_loss_mlp": 0.00761536, "balance_loss_clip": 1.0488956, "balance_loss_mlp": 1.0002327, "epoch": 0.39030842301448926, "flos": 27308951994240.0, "grad_norm": 1.589552503655944, "language_loss": 0.7289483, "learning_rate": 2.7860091324809544e-06, "loss": 0.74808085, "num_input_tokens_seen": 69924400, "step": 3246, "time_per_iteration": 2.6118240356445312 }, { "auxiliary_loss_clip": 0.0116889, "auxiliary_loss_mlp": 0.01028791, "balance_loss_clip": 1.05438185, "balance_loss_mlp": 1.02130473, "epoch": 0.39042866590512837, "flos": 27163334257920.0, "grad_norm": 1.9764949992350458, "language_loss": 0.80992258, "learning_rate": 2.7852927800031377e-06, "loss": 0.8318994, "num_input_tokens_seen": 69944565, "step": 3247, "time_per_iteration": 2.584456205368042 }, { "auxiliary_loss_clip": 0.01155526, "auxiliary_loss_mlp": 0.01033756, "balance_loss_clip": 1.04942966, "balance_loss_mlp": 1.02587605, "epoch": 0.3905489087957674, "flos": 29716115886720.0, "grad_norm": 1.7277352806985147, "language_loss": 0.82823861, "learning_rate": 2.7845763083970298e-06, "loss": 0.85013139, "num_input_tokens_seen": 69964965, "step": 3248, "time_per_iteration": 2.610621690750122 }, { "auxiliary_loss_clip": 0.0115915, "auxiliary_loss_mlp": 0.01029642, "balance_loss_clip": 1.04895723, "balance_loss_mlp": 1.02110636, "epoch": 0.39066915168640653, "flos": 24498618871680.0, "grad_norm": 2.049054980445831, "language_loss": 0.8217243, "learning_rate": 2.7838597177713205e-06, "loss": 0.84361219, "num_input_tokens_seen": 69986055, "step": 3249, "time_per_iteration": 2.5462067127227783 }, { "auxiliary_loss_clip": 0.01102515, "auxiliary_loss_mlp": 0.01033797, "balance_loss_clip": 1.04747891, "balance_loss_mlp": 1.02555919, "epoch": 0.39078939457704565, "flos": 20558572122240.0, "grad_norm": 1.6570631191782041, "language_loss": 0.73468232, "learning_rate": 2.7831430082347143e-06, "loss": 0.75604546, "num_input_tokens_seen": 70005260, "step": 3250, "time_per_iteration": 2.653055191040039 }, { "auxiliary_loss_clip": 0.01171748, "auxiliary_loss_mlp": 0.00760647, "balance_loss_clip": 1.05418539, "balance_loss_mlp": 1.00023413, "epoch": 0.3909096374676847, "flos": 22783417557120.0, "grad_norm": 2.033095262267449, "language_loss": 0.81821829, "learning_rate": 2.7824261798959373e-06, "loss": 0.83754218, "num_input_tokens_seen": 70023440, "step": 3251, "time_per_iteration": 2.537879228591919 }, { "auxiliary_loss_clip": 0.01156076, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.04867005, "balance_loss_mlp": 1.02543056, "epoch": 0.3910298803583238, "flos": 23003119094400.0, "grad_norm": 1.920746822916225, "language_loss": 0.79810292, "learning_rate": 2.78170923286373e-06, "loss": 0.82000202, "num_input_tokens_seen": 70043040, "step": 3252, "time_per_iteration": 2.579200506210327 }, { "auxiliary_loss_clip": 0.01088983, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.04511571, "balance_loss_mlp": 1.02284873, "epoch": 0.3911501232489629, "flos": 24316264500480.0, "grad_norm": 3.3187037399120114, "language_loss": 0.83940423, "learning_rate": 2.780992167246854e-06, "loss": 0.8606118, "num_input_tokens_seen": 70060565, "step": 3253, "time_per_iteration": 2.8390085697174072 }, { "auxiliary_loss_clip": 0.01052243, "auxiliary_loss_mlp": 0.01006461, "balance_loss_clip": 1.01272607, "balance_loss_mlp": 1.00492287, "epoch": 0.391270366139602, "flos": 60869054684160.0, "grad_norm": 0.9880709424166326, "language_loss": 0.72163969, "learning_rate": 2.7802749831540883e-06, "loss": 0.74222672, "num_input_tokens_seen": 70119465, "step": 3254, "time_per_iteration": 3.17475962638855 }, { "auxiliary_loss_clip": 0.01131857, "auxiliary_loss_mlp": 0.01029311, "balance_loss_clip": 1.05002081, "balance_loss_mlp": 1.02241516, "epoch": 0.3913906090302411, "flos": 21543494025600.0, "grad_norm": 2.3666916695159808, "language_loss": 0.81954634, "learning_rate": 2.7795576806942268e-06, "loss": 0.84115803, "num_input_tokens_seen": 70138270, "step": 3255, "time_per_iteration": 2.6309120655059814 }, { "auxiliary_loss_clip": 0.01052229, "auxiliary_loss_mlp": 0.01011248, "balance_loss_clip": 1.02009797, "balance_loss_mlp": 1.00961447, "epoch": 0.3915108519208802, "flos": 49839953702400.0, "grad_norm": 0.7612720311580335, "language_loss": 0.54884338, "learning_rate": 2.778840259976085e-06, "loss": 0.56947815, "num_input_tokens_seen": 70193500, "step": 3256, "time_per_iteration": 3.1145927906036377 }, { "auxiliary_loss_clip": 0.01170151, "auxiliary_loss_mlp": 0.01038557, "balance_loss_clip": 1.05212569, "balance_loss_mlp": 1.02948475, "epoch": 0.39163109481151925, "flos": 16506447960960.0, "grad_norm": 2.2852955717342778, "language_loss": 0.7750895, "learning_rate": 2.778122721108495e-06, "loss": 0.79717648, "num_input_tokens_seen": 70211730, "step": 3257, "time_per_iteration": 2.52213454246521 }, { "auxiliary_loss_clip": 0.01167518, "auxiliary_loss_mlp": 0.01031742, "balance_loss_clip": 1.05392969, "balance_loss_mlp": 1.02386248, "epoch": 0.39175133770215836, "flos": 26067484177920.0, "grad_norm": 1.954630301929467, "language_loss": 0.8884896, "learning_rate": 2.7774050642003076e-06, "loss": 0.91048223, "num_input_tokens_seen": 70232540, "step": 3258, "time_per_iteration": 2.5816099643707275 }, { "auxiliary_loss_clip": 0.01188228, "auxiliary_loss_mlp": 0.01037495, "balance_loss_clip": 1.05711007, "balance_loss_mlp": 1.02876329, "epoch": 0.3918715805927975, "flos": 21872076664320.0, "grad_norm": 2.1342684447936326, "language_loss": 0.93349922, "learning_rate": 2.7766872893603896e-06, "loss": 0.95575649, "num_input_tokens_seen": 70252515, "step": 3259, "time_per_iteration": 3.335916519165039 }, { "auxiliary_loss_clip": 0.01170113, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 1.05253899, "balance_loss_mlp": 1.01929116, "epoch": 0.39199182348343653, "flos": 20376181837440.0, "grad_norm": 1.905819965160923, "language_loss": 0.73136735, "learning_rate": 2.7759693966976275e-06, "loss": 0.75333482, "num_input_tokens_seen": 70271020, "step": 3260, "time_per_iteration": 2.536393165588379 }, { "auxiliary_loss_clip": 0.01139192, "auxiliary_loss_mlp": 0.01034649, "balance_loss_clip": 1.04852343, "balance_loss_mlp": 1.02607751, "epoch": 0.39211206637407564, "flos": 21683545153920.0, "grad_norm": 1.8057346217744474, "language_loss": 0.85341215, "learning_rate": 2.7752513863209242e-06, "loss": 0.87515068, "num_input_tokens_seen": 70289600, "step": 3261, "time_per_iteration": 2.569824695587158 }, { "auxiliary_loss_clip": 0.01154311, "auxiliary_loss_mlp": 0.00761434, "balance_loss_clip": 1.05439138, "balance_loss_mlp": 1.0002563, "epoch": 0.39223230926471475, "flos": 21066276908160.0, "grad_norm": 1.5864133084335696, "language_loss": 0.84424812, "learning_rate": 2.774533258339203e-06, "loss": 0.86340559, "num_input_tokens_seen": 70307060, "step": 3262, "time_per_iteration": 2.570836067199707 }, { "auxiliary_loss_clip": 0.01126303, "auxiliary_loss_mlp": 0.01036758, "balance_loss_clip": 1.04465139, "balance_loss_mlp": 1.02800763, "epoch": 0.3923525521553538, "flos": 17603016312960.0, "grad_norm": 2.123510192311751, "language_loss": 0.79586822, "learning_rate": 2.7738150128614014e-06, "loss": 0.8174988, "num_input_tokens_seen": 70324465, "step": 3263, "time_per_iteration": 2.5871896743774414 }, { "auxiliary_loss_clip": 0.01135282, "auxiliary_loss_mlp": 0.01031118, "balance_loss_clip": 1.05268812, "balance_loss_mlp": 1.02290416, "epoch": 0.3924727950459929, "flos": 20558284813440.0, "grad_norm": 1.7772748716760023, "language_loss": 0.89541495, "learning_rate": 2.7730966499964777e-06, "loss": 0.91707897, "num_input_tokens_seen": 70341415, "step": 3264, "time_per_iteration": 2.5985493659973145 }, { "auxiliary_loss_clip": 0.01185018, "auxiliary_loss_mlp": 0.01030495, "balance_loss_clip": 1.05404353, "balance_loss_mlp": 1.02178049, "epoch": 0.39259303793663197, "flos": 16216110328320.0, "grad_norm": 2.295951458867603, "language_loss": 0.80236995, "learning_rate": 2.772378169853408e-06, "loss": 0.82452506, "num_input_tokens_seen": 70358985, "step": 3265, "time_per_iteration": 4.094515085220337 }, { "auxiliary_loss_clip": 0.01145654, "auxiliary_loss_mlp": 0.01034374, "balance_loss_clip": 1.053671, "balance_loss_mlp": 1.02649474, "epoch": 0.3927132808272711, "flos": 16797001075200.0, "grad_norm": 2.0678292150447812, "language_loss": 0.74176633, "learning_rate": 2.771659572541183e-06, "loss": 0.76356661, "num_input_tokens_seen": 70376915, "step": 3266, "time_per_iteration": 2.5727617740631104 }, { "auxiliary_loss_clip": 0.01177775, "auxiliary_loss_mlp": 0.01031477, "balance_loss_clip": 1.05920088, "balance_loss_mlp": 1.02429748, "epoch": 0.3928335237179102, "flos": 20267228908800.0, "grad_norm": 3.350030591112571, "language_loss": 0.86953664, "learning_rate": 2.7709408581688143e-06, "loss": 0.8916291, "num_input_tokens_seen": 70396900, "step": 3267, "time_per_iteration": 2.5279858112335205 }, { "auxiliary_loss_clip": 0.01147091, "auxiliary_loss_mlp": 0.01030275, "balance_loss_clip": 1.05213046, "balance_loss_mlp": 1.02210045, "epoch": 0.39295376660854925, "flos": 24973250209920.0, "grad_norm": 2.1337871185043897, "language_loss": 0.87686706, "learning_rate": 2.7702220268453307e-06, "loss": 0.89864075, "num_input_tokens_seen": 70417260, "step": 3268, "time_per_iteration": 2.6120400428771973 }, { "auxiliary_loss_clip": 0.01162462, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.05328667, "balance_loss_mlp": 1.02171075, "epoch": 0.39307400949918836, "flos": 18697788984960.0, "grad_norm": 2.3648190785906134, "language_loss": 0.84555864, "learning_rate": 2.7695030786797785e-06, "loss": 0.86747992, "num_input_tokens_seen": 70433155, "step": 3269, "time_per_iteration": 2.5464043617248535 }, { "auxiliary_loss_clip": 0.01124103, "auxiliary_loss_mlp": 0.01035165, "balance_loss_clip": 1.04748297, "balance_loss_mlp": 1.02693975, "epoch": 0.39319425238982747, "flos": 22415476590720.0, "grad_norm": 1.9286264774730058, "language_loss": 0.74821699, "learning_rate": 2.7687840137812206e-06, "loss": 0.7698096, "num_input_tokens_seen": 70451240, "step": 3270, "time_per_iteration": 2.6429104804992676 }, { "auxiliary_loss_clip": 0.01057816, "auxiliary_loss_mlp": 0.0100833, "balance_loss_clip": 1.0129385, "balance_loss_mlp": 1.00675666, "epoch": 0.3933144952804665, "flos": 66192954762240.0, "grad_norm": 0.7978615358698185, "language_loss": 0.62118399, "learning_rate": 2.7680648322587395e-06, "loss": 0.64184546, "num_input_tokens_seen": 70516115, "step": 3271, "time_per_iteration": 3.1447014808654785 }, { "auxiliary_loss_clip": 0.01185082, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.05548275, "balance_loss_mlp": 1.02041137, "epoch": 0.39343473817110564, "flos": 15487159720320.0, "grad_norm": 3.2010334533145963, "language_loss": 0.81020206, "learning_rate": 2.7673455342214334e-06, "loss": 0.83234084, "num_input_tokens_seen": 70533105, "step": 3272, "time_per_iteration": 2.4701485633850098 }, { "auxiliary_loss_clip": 0.0117424, "auxiliary_loss_mlp": 0.01032628, "balance_loss_clip": 1.0556078, "balance_loss_mlp": 1.02489114, "epoch": 0.39355498106174475, "flos": 21324905809920.0, "grad_norm": 2.1309175649715897, "language_loss": 0.76431823, "learning_rate": 2.7666261197784198e-06, "loss": 0.78638697, "num_input_tokens_seen": 70551920, "step": 3273, "time_per_iteration": 2.5363118648529053 }, { "auxiliary_loss_clip": 0.01150212, "auxiliary_loss_mlp": 0.01026077, "balance_loss_clip": 1.05297697, "balance_loss_mlp": 1.01857853, "epoch": 0.3936752239523838, "flos": 13296357400320.0, "grad_norm": 1.9844937439558237, "language_loss": 0.76837862, "learning_rate": 2.7659065890388336e-06, "loss": 0.79014146, "num_input_tokens_seen": 70567920, "step": 3274, "time_per_iteration": 2.577951431274414 }, { "auxiliary_loss_clip": 0.01160623, "auxiliary_loss_mlp": 0.01029492, "balance_loss_clip": 1.05368352, "balance_loss_mlp": 1.02164793, "epoch": 0.3937954668430229, "flos": 16800161472000.0, "grad_norm": 2.2151005199006804, "language_loss": 0.85047209, "learning_rate": 2.7651869421118266e-06, "loss": 0.87237322, "num_input_tokens_seen": 70584530, "step": 3275, "time_per_iteration": 2.566519260406494 }, { "auxiliary_loss_clip": 0.01174999, "auxiliary_loss_mlp": 0.0103017, "balance_loss_clip": 1.05638647, "balance_loss_mlp": 1.02199173, "epoch": 0.393915709733662, "flos": 21064229832960.0, "grad_norm": 1.6342999295201055, "language_loss": 0.82774508, "learning_rate": 2.76446717910657e-06, "loss": 0.84979671, "num_input_tokens_seen": 70605235, "step": 3276, "time_per_iteration": 2.546081066131592 }, { "auxiliary_loss_clip": 0.01166774, "auxiliary_loss_mlp": 0.01027753, "balance_loss_clip": 1.05252433, "balance_loss_mlp": 1.01981974, "epoch": 0.3940359526243011, "flos": 17165265264000.0, "grad_norm": 2.331022270642384, "language_loss": 0.76435333, "learning_rate": 2.763747300132249e-06, "loss": 0.78629863, "num_input_tokens_seen": 70622675, "step": 3277, "time_per_iteration": 2.485366106033325 }, { "auxiliary_loss_clip": 0.01187725, "auxiliary_loss_mlp": 0.01035974, "balance_loss_clip": 1.05767298, "balance_loss_mlp": 1.02789736, "epoch": 0.3941561955149402, "flos": 20995856294400.0, "grad_norm": 1.6703445428036323, "language_loss": 0.86632192, "learning_rate": 2.7630273052980704e-06, "loss": 0.88855898, "num_input_tokens_seen": 70643265, "step": 3278, "time_per_iteration": 2.4960572719573975 }, { "auxiliary_loss_clip": 0.01148258, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.05256605, "balance_loss_mlp": 1.02515125, "epoch": 0.39427643840557924, "flos": 18843406721280.0, "grad_norm": 2.3811959633599935, "language_loss": 0.67455351, "learning_rate": 2.7623071947132554e-06, "loss": 0.69636929, "num_input_tokens_seen": 70660295, "step": 3279, "time_per_iteration": 2.546435594558716 }, { "auxiliary_loss_clip": 0.01162527, "auxiliary_loss_mlp": 0.01031639, "balance_loss_clip": 1.05239069, "balance_loss_mlp": 1.02348542, "epoch": 0.39439668129621835, "flos": 23258659426560.0, "grad_norm": 2.0566377453372477, "language_loss": 0.78740907, "learning_rate": 2.7615869684870458e-06, "loss": 0.80935067, "num_input_tokens_seen": 70679605, "step": 3280, "time_per_iteration": 2.573173761367798 }, { "auxiliary_loss_clip": 0.01171875, "auxiliary_loss_mlp": 0.01033458, "balance_loss_clip": 1.05555797, "balance_loss_mlp": 1.02532792, "epoch": 0.39451692418685746, "flos": 26652289507200.0, "grad_norm": 1.6225039422081649, "language_loss": 0.8419075, "learning_rate": 2.7608666267286986e-06, "loss": 0.8639608, "num_input_tokens_seen": 70699835, "step": 3281, "time_per_iteration": 2.5638794898986816 }, { "auxiliary_loss_clip": 0.01110819, "auxiliary_loss_mlp": 0.01030454, "balance_loss_clip": 1.04595852, "balance_loss_mlp": 1.02205586, "epoch": 0.3946371670774965, "flos": 18258709132800.0, "grad_norm": 2.0236008008002995, "language_loss": 0.86290181, "learning_rate": 2.760146169547489e-06, "loss": 0.88431454, "num_input_tokens_seen": 70716600, "step": 3282, "time_per_iteration": 2.6307291984558105 }, { "auxiliary_loss_clip": 0.0115997, "auxiliary_loss_mlp": 0.01042985, "balance_loss_clip": 1.05572939, "balance_loss_mlp": 1.03413987, "epoch": 0.39475740996813563, "flos": 24206126423040.0, "grad_norm": 1.4231476928590283, "language_loss": 0.76266265, "learning_rate": 2.75942559705271e-06, "loss": 0.78469217, "num_input_tokens_seen": 70736335, "step": 3283, "time_per_iteration": 2.5898656845092773 }, { "auxiliary_loss_clip": 0.01167257, "auxiliary_loss_mlp": 0.0102965, "balance_loss_clip": 1.05377078, "balance_loss_mlp": 1.0212338, "epoch": 0.39487765285877474, "flos": 19317858491520.0, "grad_norm": 1.9167890177164317, "language_loss": 0.8911733, "learning_rate": 2.7587049093536713e-06, "loss": 0.91314238, "num_input_tokens_seen": 70752665, "step": 3284, "time_per_iteration": 2.5155842304229736 }, { "auxiliary_loss_clip": 0.01173582, "auxiliary_loss_mlp": 0.01037013, "balance_loss_clip": 1.05436468, "balance_loss_mlp": 1.02945256, "epoch": 0.3949978957494138, "flos": 17311744926720.0, "grad_norm": 1.8266131499792055, "language_loss": 0.80683261, "learning_rate": 2.757984106559701e-06, "loss": 0.82893848, "num_input_tokens_seen": 70771650, "step": 3285, "time_per_iteration": 2.520636796951294 }, { "auxiliary_loss_clip": 0.01151428, "auxiliary_loss_mlp": 0.01036935, "balance_loss_clip": 1.05452752, "balance_loss_mlp": 1.02863252, "epoch": 0.3951181386400529, "flos": 36317861280000.0, "grad_norm": 2.892297521277485, "language_loss": 0.71352828, "learning_rate": 2.7572631887801446e-06, "loss": 0.73541194, "num_input_tokens_seen": 70793275, "step": 3286, "time_per_iteration": 3.4727320671081543 }, { "auxiliary_loss_clip": 0.01171379, "auxiliary_loss_mlp": 0.01026225, "balance_loss_clip": 1.05533028, "balance_loss_mlp": 1.01757634, "epoch": 0.395238381530692, "flos": 23110348170240.0, "grad_norm": 1.7365804446576287, "language_loss": 0.76525402, "learning_rate": 2.7565421561243654e-06, "loss": 0.78723007, "num_input_tokens_seen": 70811440, "step": 3287, "time_per_iteration": 2.5473058223724365 }, { "auxiliary_loss_clip": 0.0113478, "auxiliary_loss_mlp": 0.01030105, "balance_loss_clip": 1.04849851, "balance_loss_mlp": 1.02239203, "epoch": 0.3953586244213311, "flos": 24347614095360.0, "grad_norm": 2.3900522875107746, "language_loss": 0.82140088, "learning_rate": 2.7558210087017413e-06, "loss": 0.84304976, "num_input_tokens_seen": 70831375, "step": 3288, "time_per_iteration": 2.616666078567505 }, { "auxiliary_loss_clip": 0.01139168, "auxiliary_loss_mlp": 0.01028302, "balance_loss_clip": 1.05337, "balance_loss_mlp": 1.01939106, "epoch": 0.3954788673119702, "flos": 23440080044160.0, "grad_norm": 2.008667809664639, "language_loss": 0.73117793, "learning_rate": 2.7550997466216724e-06, "loss": 0.75285262, "num_input_tokens_seen": 70849170, "step": 3289, "time_per_iteration": 2.5922436714172363 }, { "auxiliary_loss_clip": 0.01153873, "auxiliary_loss_mlp": 0.01028839, "balance_loss_clip": 1.05578983, "balance_loss_mlp": 1.02035761, "epoch": 0.3955991102026093, "flos": 17494063384320.0, "grad_norm": 2.457065533447961, "language_loss": 0.80941987, "learning_rate": 2.7543783699935714e-06, "loss": 0.83124697, "num_input_tokens_seen": 70867200, "step": 3290, "time_per_iteration": 3.4038445949554443 }, { "auxiliary_loss_clip": 0.01171445, "auxiliary_loss_mlp": 0.01026741, "balance_loss_clip": 1.05879951, "balance_loss_mlp": 1.0189209, "epoch": 0.39571935309324835, "flos": 18221326053120.0, "grad_norm": 2.3971123935362137, "language_loss": 0.85789573, "learning_rate": 2.753656878926872e-06, "loss": 0.87987757, "num_input_tokens_seen": 70883080, "step": 3291, "time_per_iteration": 4.071390151977539 }, { "auxiliary_loss_clip": 0.01145009, "auxiliary_loss_mlp": 0.01036039, "balance_loss_clip": 1.04919922, "balance_loss_mlp": 1.02733636, "epoch": 0.39583959598388746, "flos": 17748813617280.0, "grad_norm": 1.9408406446712727, "language_loss": 0.74345124, "learning_rate": 2.752935273531023e-06, "loss": 0.76526165, "num_input_tokens_seen": 70901230, "step": 3292, "time_per_iteration": 2.532723903656006 }, { "auxiliary_loss_clip": 0.01174694, "auxiliary_loss_mlp": 0.01033203, "balance_loss_clip": 1.0588572, "balance_loss_mlp": 1.0243156, "epoch": 0.39595983887452657, "flos": 19352368483200.0, "grad_norm": 1.7887562831309047, "language_loss": 0.78467679, "learning_rate": 2.752213553915492e-06, "loss": 0.80675572, "num_input_tokens_seen": 70919585, "step": 3293, "time_per_iteration": 2.5259244441986084 }, { "auxiliary_loss_clip": 0.01052108, "auxiliary_loss_mlp": 0.010026, "balance_loss_clip": 1.0171566, "balance_loss_mlp": 1.00117588, "epoch": 0.3960800817651656, "flos": 60682282940160.0, "grad_norm": 0.8129821457787784, "language_loss": 0.66054308, "learning_rate": 2.751491720189762e-06, "loss": 0.68109012, "num_input_tokens_seen": 70977695, "step": 3294, "time_per_iteration": 3.082036018371582 }, { "auxiliary_loss_clip": 0.0115708, "auxiliary_loss_mlp": 0.00761619, "balance_loss_clip": 1.05355525, "balance_loss_mlp": 1.00032139, "epoch": 0.39620032465580474, "flos": 16836718538880.0, "grad_norm": 2.030760166406356, "language_loss": 0.91710925, "learning_rate": 2.7507697724633364e-06, "loss": 0.93629628, "num_input_tokens_seen": 70994455, "step": 3295, "time_per_iteration": 2.5268332958221436 }, { "auxiliary_loss_clip": 0.01042586, "auxiliary_loss_mlp": 0.01003243, "balance_loss_clip": 1.02218175, "balance_loss_mlp": 1.00170541, "epoch": 0.3963205675464438, "flos": 69071445941760.0, "grad_norm": 0.7735171579267955, "language_loss": 0.54697073, "learning_rate": 2.7500477108457327e-06, "loss": 0.56742907, "num_input_tokens_seen": 71046465, "step": 3296, "time_per_iteration": 2.972369432449341 }, { "auxiliary_loss_clip": 0.01167161, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.05281234, "balance_loss_mlp": 1.02270377, "epoch": 0.3964408104370829, "flos": 25667439431040.0, "grad_norm": 1.9110668256920045, "language_loss": 0.80341047, "learning_rate": 2.7493255354464877e-06, "loss": 0.82538813, "num_input_tokens_seen": 71064275, "step": 3297, "time_per_iteration": 2.5592949390411377 }, { "auxiliary_loss_clip": 0.01058324, "auxiliary_loss_mlp": 0.01031381, "balance_loss_clip": 1.03888309, "balance_loss_mlp": 1.02319145, "epoch": 0.396561053327722, "flos": 24277480790400.0, "grad_norm": 1.7489507792176195, "language_loss": 0.75932407, "learning_rate": 2.748603246375156e-06, "loss": 0.78022116, "num_input_tokens_seen": 71082290, "step": 3298, "time_per_iteration": 2.9225733280181885 }, { "auxiliary_loss_clip": 0.01185642, "auxiliary_loss_mlp": 0.01033435, "balance_loss_clip": 1.0567919, "balance_loss_mlp": 1.02559066, "epoch": 0.39668129621836107, "flos": 20522302364160.0, "grad_norm": 2.4727723494217444, "language_loss": 0.69935441, "learning_rate": 2.7478808437413055e-06, "loss": 0.72154522, "num_input_tokens_seen": 71101700, "step": 3299, "time_per_iteration": 2.6937520503997803 }, { "auxiliary_loss_clip": 0.01127575, "auxiliary_loss_mlp": 0.01030009, "balance_loss_clip": 1.05220294, "balance_loss_mlp": 1.02206409, "epoch": 0.3968015391090002, "flos": 27052585649280.0, "grad_norm": 2.3214455503447873, "language_loss": 0.65805173, "learning_rate": 2.7471583276545263e-06, "loss": 0.6796276, "num_input_tokens_seen": 71122360, "step": 3300, "time_per_iteration": 2.6683387756347656 }, { "auxiliary_loss_clip": 0.01157527, "auxiliary_loss_mlp": 0.01028493, "balance_loss_clip": 1.05259764, "balance_loss_mlp": 1.02069128, "epoch": 0.3969217819996393, "flos": 12531819392640.0, "grad_norm": 1.9290801131675008, "language_loss": 0.70496756, "learning_rate": 2.7464356982244224e-06, "loss": 0.72682774, "num_input_tokens_seen": 71140360, "step": 3301, "time_per_iteration": 2.560967206954956 }, { "auxiliary_loss_clip": 0.01064903, "auxiliary_loss_mlp": 0.01001034, "balance_loss_clip": 1.02266634, "balance_loss_mlp": 0.99963969, "epoch": 0.39704202489027834, "flos": 66241399230720.0, "grad_norm": 0.7988742560867907, "language_loss": 0.61715102, "learning_rate": 2.745712955560617e-06, "loss": 0.63781041, "num_input_tokens_seen": 71196565, "step": 3302, "time_per_iteration": 3.0620484352111816 }, { "auxiliary_loss_clip": 0.01115167, "auxiliary_loss_mlp": 0.01026017, "balance_loss_clip": 1.0485487, "balance_loss_mlp": 1.01763105, "epoch": 0.39716226778091746, "flos": 16982982720000.0, "grad_norm": 2.6489340734253477, "language_loss": 0.76873058, "learning_rate": 2.7449900997727496e-06, "loss": 0.79014242, "num_input_tokens_seen": 71214675, "step": 3303, "time_per_iteration": 2.6332030296325684 }, { "auxiliary_loss_clip": 0.01151733, "auxiliary_loss_mlp": 0.01029901, "balance_loss_clip": 1.05304015, "balance_loss_mlp": 1.02197385, "epoch": 0.39728251067155657, "flos": 23477139901440.0, "grad_norm": 1.6853211034812077, "language_loss": 0.8411541, "learning_rate": 2.744267130970476e-06, "loss": 0.86297047, "num_input_tokens_seen": 71234400, "step": 3304, "time_per_iteration": 2.5914056301116943 }, { "auxiliary_loss_clip": 0.01152548, "auxiliary_loss_mlp": 0.01028894, "balance_loss_clip": 1.05313015, "balance_loss_mlp": 1.02047741, "epoch": 0.3974027535621956, "flos": 20704441253760.0, "grad_norm": 1.927759408763687, "language_loss": 0.77471602, "learning_rate": 2.7435440492634697e-06, "loss": 0.79653049, "num_input_tokens_seen": 71253725, "step": 3305, "time_per_iteration": 2.5737717151641846 }, { "auxiliary_loss_clip": 0.01155242, "auxiliary_loss_mlp": 0.01030262, "balance_loss_clip": 1.0521692, "balance_loss_mlp": 1.02075553, "epoch": 0.39752299645283473, "flos": 21543278544000.0, "grad_norm": 1.9592512502219983, "language_loss": 0.67389464, "learning_rate": 2.7428208547614228e-06, "loss": 0.69574964, "num_input_tokens_seen": 71273220, "step": 3306, "time_per_iteration": 2.573829412460327 }, { "auxiliary_loss_clip": 0.01171515, "auxiliary_loss_mlp": 0.01031713, "balance_loss_clip": 1.05674767, "balance_loss_mlp": 1.02351725, "epoch": 0.39764323934347384, "flos": 19208295031680.0, "grad_norm": 1.9418032805684147, "language_loss": 0.77249783, "learning_rate": 2.742097547574043e-06, "loss": 0.79453015, "num_input_tokens_seen": 71291445, "step": 3307, "time_per_iteration": 2.5258736610412598 }, { "auxiliary_loss_clip": 0.01160599, "auxiliary_loss_mlp": 0.00761514, "balance_loss_clip": 1.05258822, "balance_loss_mlp": 1.00033844, "epoch": 0.3977634822341129, "flos": 20850202644480.0, "grad_norm": 2.1180484698303172, "language_loss": 0.77222586, "learning_rate": 2.7413741278110544e-06, "loss": 0.79144698, "num_input_tokens_seen": 71310135, "step": 3308, "time_per_iteration": 2.568763256072998 }, { "auxiliary_loss_clip": 0.01159932, "auxiliary_loss_mlp": 0.01030806, "balance_loss_clip": 1.05487466, "balance_loss_mlp": 1.02262247, "epoch": 0.397883725124752, "flos": 39786042038400.0, "grad_norm": 2.1326684185653253, "language_loss": 0.68663478, "learning_rate": 2.7406505955822016e-06, "loss": 0.70854211, "num_input_tokens_seen": 71331160, "step": 3309, "time_per_iteration": 2.7219078540802 }, { "auxiliary_loss_clip": 0.01152981, "auxiliary_loss_mlp": 0.010309, "balance_loss_clip": 1.05048513, "balance_loss_mlp": 1.02288949, "epoch": 0.39800396801539106, "flos": 17379507934080.0, "grad_norm": 3.2728819154551605, "language_loss": 0.66288781, "learning_rate": 2.7399269509972415e-06, "loss": 0.68472666, "num_input_tokens_seen": 71345315, "step": 3310, "time_per_iteration": 2.518404960632324 }, { "auxiliary_loss_clip": 0.01145745, "auxiliary_loss_mlp": 0.01032356, "balance_loss_clip": 1.04636765, "balance_loss_mlp": 1.02318907, "epoch": 0.3981242109060302, "flos": 19202764337280.0, "grad_norm": 2.420787134398533, "language_loss": 0.85568476, "learning_rate": 2.7392031941659514e-06, "loss": 0.87746578, "num_input_tokens_seen": 71363160, "step": 3311, "time_per_iteration": 3.3499300479888916 }, { "auxiliary_loss_clip": 0.01157677, "auxiliary_loss_mlp": 0.01028967, "balance_loss_clip": 1.05691862, "balance_loss_mlp": 1.02098048, "epoch": 0.3982444537966693, "flos": 24565124903040.0, "grad_norm": 2.027237978263829, "language_loss": 0.85940158, "learning_rate": 2.7384793251981244e-06, "loss": 0.88126802, "num_input_tokens_seen": 71382145, "step": 3312, "time_per_iteration": 2.6079113483428955 }, { "auxiliary_loss_clip": 0.01174385, "auxiliary_loss_mlp": 0.01031654, "balance_loss_clip": 1.05407584, "balance_loss_mlp": 1.02394152, "epoch": 0.39836469668730834, "flos": 26213856099840.0, "grad_norm": 1.7148447482353042, "language_loss": 0.80605197, "learning_rate": 2.737755344203571e-06, "loss": 0.82811236, "num_input_tokens_seen": 71402095, "step": 3313, "time_per_iteration": 2.717622756958008 }, { "auxiliary_loss_clip": 0.01171739, "auxiliary_loss_mlp": 0.01031686, "balance_loss_clip": 1.05520117, "balance_loss_mlp": 1.02332354, "epoch": 0.39848493957794745, "flos": 27636134002560.0, "grad_norm": 1.5056439760959839, "language_loss": 0.79993337, "learning_rate": 2.7370312512921186e-06, "loss": 0.8219676, "num_input_tokens_seen": 71423875, "step": 3314, "time_per_iteration": 2.5878875255584717 }, { "auxiliary_loss_clip": 0.01156642, "auxiliary_loss_mlp": 0.01028516, "balance_loss_clip": 1.05029678, "balance_loss_mlp": 1.0195868, "epoch": 0.39860518246858656, "flos": 12239326944000.0, "grad_norm": 2.191526585801904, "language_loss": 0.76624763, "learning_rate": 2.736307046573611e-06, "loss": 0.78809923, "num_input_tokens_seen": 71439745, "step": 3315, "time_per_iteration": 2.5447206497192383 }, { "auxiliary_loss_clip": 0.01180907, "auxiliary_loss_mlp": 0.01029272, "balance_loss_clip": 1.05361164, "balance_loss_mlp": 1.02136827, "epoch": 0.3987254253592256, "flos": 22379135005440.0, "grad_norm": 1.7069022897227117, "language_loss": 0.81826562, "learning_rate": 2.73558273015791e-06, "loss": 0.84036744, "num_input_tokens_seen": 71459575, "step": 3316, "time_per_iteration": 3.293494939804077 }, { "auxiliary_loss_clip": 0.0118664, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.05639482, "balance_loss_mlp": 1.02276373, "epoch": 0.3988456682498647, "flos": 23514020190720.0, "grad_norm": 2.266805566956406, "language_loss": 0.71389675, "learning_rate": 2.734858302154894e-06, "loss": 0.73607397, "num_input_tokens_seen": 71481075, "step": 3317, "time_per_iteration": 4.075145959854126 }, { "auxiliary_loss_clip": 0.01151747, "auxiliary_loss_mlp": 0.01028068, "balance_loss_clip": 1.05188441, "balance_loss_mlp": 1.02024746, "epoch": 0.39896591114050384, "flos": 19208761908480.0, "grad_norm": 4.478554232385496, "language_loss": 0.76242644, "learning_rate": 2.734133762674457e-06, "loss": 0.78422457, "num_input_tokens_seen": 71500665, "step": 3318, "time_per_iteration": 2.555310010910034 }, { "auxiliary_loss_clip": 0.01158735, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 1.05432987, "balance_loss_mlp": 1.02046382, "epoch": 0.3990861540311429, "flos": 28401031146240.0, "grad_norm": 1.9297441681361427, "language_loss": 0.70583892, "learning_rate": 2.7334091118265124e-06, "loss": 0.7277112, "num_input_tokens_seen": 71522560, "step": 3319, "time_per_iteration": 2.666428327560425 }, { "auxiliary_loss_clip": 0.0106552, "auxiliary_loss_mlp": 0.01010123, "balance_loss_clip": 1.01775646, "balance_loss_mlp": 1.0086863, "epoch": 0.399206396921782, "flos": 61758563086080.0, "grad_norm": 0.6817551335941444, "language_loss": 0.5782308, "learning_rate": 2.732684349720989e-06, "loss": 0.59898722, "num_input_tokens_seen": 71590520, "step": 3320, "time_per_iteration": 3.121854305267334 }, { "auxiliary_loss_clip": 0.01142268, "auxiliary_loss_mlp": 0.01030544, "balance_loss_clip": 1.04986453, "balance_loss_mlp": 1.02214551, "epoch": 0.3993266398124211, "flos": 28074567409920.0, "grad_norm": 1.7137408975460526, "language_loss": 0.7505728, "learning_rate": 2.7319594764678318e-06, "loss": 0.77230096, "num_input_tokens_seen": 71612620, "step": 3321, "time_per_iteration": 2.6383843421936035 }, { "auxiliary_loss_clip": 0.01128991, "auxiliary_loss_mlp": 0.01035486, "balance_loss_clip": 1.04957819, "balance_loss_mlp": 1.02675426, "epoch": 0.39944688270306017, "flos": 23225083188480.0, "grad_norm": 11.017534996621418, "language_loss": 0.83142871, "learning_rate": 2.7312344921770044e-06, "loss": 0.85307348, "num_input_tokens_seen": 71634320, "step": 3322, "time_per_iteration": 2.659432888031006 }, { "auxiliary_loss_clip": 0.01154859, "auxiliary_loss_mlp": 0.01025745, "balance_loss_clip": 1.05069232, "balance_loss_mlp": 1.01819897, "epoch": 0.3995671255936993, "flos": 19390433921280.0, "grad_norm": 1.8086302134730927, "language_loss": 0.78571028, "learning_rate": 2.7305093969584857e-06, "loss": 0.80751634, "num_input_tokens_seen": 71653145, "step": 3323, "time_per_iteration": 2.5431511402130127 }, { "auxiliary_loss_clip": 0.01163549, "auxiliary_loss_mlp": 0.01031459, "balance_loss_clip": 1.05208802, "balance_loss_mlp": 1.02368653, "epoch": 0.3996873684843384, "flos": 23842638743040.0, "grad_norm": 1.9090835762648626, "language_loss": 0.80103266, "learning_rate": 2.729784190922272e-06, "loss": 0.82298273, "num_input_tokens_seen": 71674580, "step": 3324, "time_per_iteration": 2.7020463943481445 }, { "auxiliary_loss_clip": 0.01048096, "auxiliary_loss_mlp": 0.01002681, "balance_loss_clip": 1.01411486, "balance_loss_mlp": 1.0011189, "epoch": 0.39980761137497745, "flos": 66576877280640.0, "grad_norm": 0.9460573816378759, "language_loss": 0.57178736, "learning_rate": 2.729058874178378e-06, "loss": 0.59229517, "num_input_tokens_seen": 71745260, "step": 3325, "time_per_iteration": 3.199441432952881 }, { "auxiliary_loss_clip": 0.01160955, "auxiliary_loss_mlp": 0.01028019, "balance_loss_clip": 1.0541923, "balance_loss_mlp": 1.0194838, "epoch": 0.39992785426561656, "flos": 28549162834560.0, "grad_norm": 2.0373415527164767, "language_loss": 0.68971497, "learning_rate": 2.7283334468368315e-06, "loss": 0.71160471, "num_input_tokens_seen": 71766540, "step": 3326, "time_per_iteration": 2.6941609382629395 }, { "auxiliary_loss_clip": 0.01081471, "auxiliary_loss_mlp": 0.01027437, "balance_loss_clip": 1.03998208, "balance_loss_mlp": 1.01893127, "epoch": 0.4000480971562556, "flos": 15049408671360.0, "grad_norm": 1.871062557544671, "language_loss": 0.72889221, "learning_rate": 2.72760790900768e-06, "loss": 0.74998128, "num_input_tokens_seen": 71783125, "step": 3327, "time_per_iteration": 2.7893972396850586 }, { "auxiliary_loss_clip": 0.01187092, "auxiliary_loss_mlp": 0.01029321, "balance_loss_clip": 1.05846369, "balance_loss_mlp": 1.0211848, "epoch": 0.4001683400468947, "flos": 23915609222400.0, "grad_norm": 1.8215921715513685, "language_loss": 0.79121959, "learning_rate": 2.7268822608009875e-06, "loss": 0.8133837, "num_input_tokens_seen": 71802500, "step": 3328, "time_per_iteration": 2.8155364990234375 }, { "auxiliary_loss_clip": 0.01147576, "auxiliary_loss_mlp": 0.01030505, "balance_loss_clip": 1.05193067, "balance_loss_mlp": 1.02252376, "epoch": 0.40028858293753383, "flos": 24352677912960.0, "grad_norm": 1.9811420240404083, "language_loss": 0.78053224, "learning_rate": 2.726156502326834e-06, "loss": 0.80231303, "num_input_tokens_seen": 71823800, "step": 3329, "time_per_iteration": 2.7047863006591797 }, { "auxiliary_loss_clip": 0.01020533, "auxiliary_loss_mlp": 0.01001206, "balance_loss_clip": 1.01619411, "balance_loss_mlp": 0.99963284, "epoch": 0.4004088258281729, "flos": 66787025800320.0, "grad_norm": 0.69538167002949, "language_loss": 0.60260153, "learning_rate": 2.725430633695316e-06, "loss": 0.62281895, "num_input_tokens_seen": 71886880, "step": 3330, "time_per_iteration": 3.2746493816375732 }, { "auxiliary_loss_clip": 0.01071055, "auxiliary_loss_mlp": 0.01004128, "balance_loss_clip": 1.01403213, "balance_loss_mlp": 1.00280428, "epoch": 0.400529068718812, "flos": 58598386473600.0, "grad_norm": 0.9087665369101099, "language_loss": 0.58001769, "learning_rate": 2.7247046550165485e-06, "loss": 0.60076952, "num_input_tokens_seen": 71939005, "step": 3331, "time_per_iteration": 3.109802484512329 }, { "auxiliary_loss_clip": 0.01186647, "auxiliary_loss_mlp": 0.01033389, "balance_loss_clip": 1.05826902, "balance_loss_mlp": 1.02427006, "epoch": 0.4006493116094511, "flos": 25377460934400.0, "grad_norm": 1.4113942120544534, "language_loss": 0.75682831, "learning_rate": 2.7239785664006606e-06, "loss": 0.77902859, "num_input_tokens_seen": 71962545, "step": 3332, "time_per_iteration": 2.5876948833465576 }, { "auxiliary_loss_clip": 0.01061021, "auxiliary_loss_mlp": 0.01005247, "balance_loss_clip": 1.01361299, "balance_loss_mlp": 1.00396562, "epoch": 0.40076955450009016, "flos": 60280729822080.0, "grad_norm": 0.9866681568630786, "language_loss": 0.61797071, "learning_rate": 2.7232523679578002e-06, "loss": 0.63863337, "num_input_tokens_seen": 72025625, "step": 3333, "time_per_iteration": 3.1671290397644043 }, { "auxiliary_loss_clip": 0.01172267, "auxiliary_loss_mlp": 0.01037208, "balance_loss_clip": 1.05733252, "balance_loss_mlp": 1.02914906, "epoch": 0.4008897973907293, "flos": 16617268396800.0, "grad_norm": 2.123020252981161, "language_loss": 0.79410017, "learning_rate": 2.7225260597981295e-06, "loss": 0.81619489, "num_input_tokens_seen": 72043330, "step": 3334, "time_per_iteration": 2.5278756618499756 }, { "auxiliary_loss_clip": 0.01141591, "auxiliary_loss_mlp": 0.00762415, "balance_loss_clip": 1.0525912, "balance_loss_mlp": 1.00034368, "epoch": 0.4010100402813684, "flos": 15377344865280.0, "grad_norm": 2.858855117760107, "language_loss": 0.78497851, "learning_rate": 2.721799642031831e-06, "loss": 0.80401862, "num_input_tokens_seen": 72059500, "step": 3335, "time_per_iteration": 2.5957112312316895 }, { "auxiliary_loss_clip": 0.01161648, "auxiliary_loss_mlp": 0.01028496, "balance_loss_clip": 1.05198014, "balance_loss_mlp": 1.02041376, "epoch": 0.40113028317200744, "flos": 13298835438720.0, "grad_norm": 2.0133398303646404, "language_loss": 0.77450228, "learning_rate": 2.721073114769101e-06, "loss": 0.79640365, "num_input_tokens_seen": 72077175, "step": 3336, "time_per_iteration": 2.560335159301758 }, { "auxiliary_loss_clip": 0.01141674, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.05259871, "balance_loss_mlp": 1.02225733, "epoch": 0.40125052606264655, "flos": 20668027841280.0, "grad_norm": 1.8403792208800422, "language_loss": 0.74809957, "learning_rate": 2.7203464781201523e-06, "loss": 0.76981556, "num_input_tokens_seen": 72096490, "step": 3337, "time_per_iteration": 3.7842981815338135 }, { "auxiliary_loss_clip": 0.01185894, "auxiliary_loss_mlp": 0.01033408, "balance_loss_clip": 1.05662525, "balance_loss_mlp": 1.02587438, "epoch": 0.40137076895328566, "flos": 24607679541120.0, "grad_norm": 2.043070867849423, "language_loss": 0.78008741, "learning_rate": 2.719619732195215e-06, "loss": 0.80228043, "num_input_tokens_seen": 72118130, "step": 3338, "time_per_iteration": 2.618217706680298 }, { "auxiliary_loss_clip": 0.01142262, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.05210829, "balance_loss_mlp": 1.02658105, "epoch": 0.4014910118439247, "flos": 24206593299840.0, "grad_norm": 1.3947879171429154, "language_loss": 0.72701442, "learning_rate": 2.7188928771045377e-06, "loss": 0.74878335, "num_input_tokens_seen": 72139450, "step": 3339, "time_per_iteration": 2.6293091773986816 }, { "auxiliary_loss_clip": 0.011374, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 1.04968345, "balance_loss_mlp": 1.02026486, "epoch": 0.4016112547345638, "flos": 26725080418560.0, "grad_norm": 1.60668316265841, "language_loss": 0.80158603, "learning_rate": 2.7181659129583815e-06, "loss": 0.82324582, "num_input_tokens_seen": 72159040, "step": 3340, "time_per_iteration": 2.6706857681274414 }, { "auxiliary_loss_clip": 0.01146988, "auxiliary_loss_mlp": 0.01024279, "balance_loss_clip": 1.04792571, "balance_loss_mlp": 1.01620317, "epoch": 0.4017314976252029, "flos": 21288025520640.0, "grad_norm": 1.6364106908211977, "language_loss": 0.7549026, "learning_rate": 2.7174388398670276e-06, "loss": 0.77661526, "num_input_tokens_seen": 72178220, "step": 3341, "time_per_iteration": 2.577338933944702 }, { "auxiliary_loss_clip": 0.01182108, "auxiliary_loss_mlp": 0.01028732, "balance_loss_clip": 1.05257559, "balance_loss_mlp": 1.02037537, "epoch": 0.401851740515842, "flos": 25484690010240.0, "grad_norm": 1.9986596620399, "language_loss": 0.91736746, "learning_rate": 2.716711657940773e-06, "loss": 0.93947577, "num_input_tokens_seen": 72199230, "step": 3342, "time_per_iteration": 4.088343620300293 }, { "auxiliary_loss_clip": 0.01039451, "auxiliary_loss_mlp": 0.01002545, "balance_loss_clip": 1.0159111, "balance_loss_mlp": 1.00129974, "epoch": 0.4019719834064811, "flos": 55395334978560.0, "grad_norm": 0.8148904899020343, "language_loss": 0.5656327, "learning_rate": 2.7159843672899284e-06, "loss": 0.58605266, "num_input_tokens_seen": 72263430, "step": 3343, "time_per_iteration": 4.11398458480835 }, { "auxiliary_loss_clip": 0.01174022, "auxiliary_loss_mlp": 0.01030054, "balance_loss_clip": 1.05736732, "balance_loss_mlp": 1.02201939, "epoch": 0.40209222629712016, "flos": 18180100218240.0, "grad_norm": 1.9537745856659812, "language_loss": 0.81331533, "learning_rate": 2.715256968024825e-06, "loss": 0.83535612, "num_input_tokens_seen": 72280505, "step": 3344, "time_per_iteration": 2.516437292098999 }, { "auxiliary_loss_clip": 0.01161655, "auxiliary_loss_mlp": 0.01031792, "balance_loss_clip": 1.05416083, "balance_loss_mlp": 1.02399015, "epoch": 0.40221246918775927, "flos": 25961009287680.0, "grad_norm": 1.7446634914750148, "language_loss": 0.82099319, "learning_rate": 2.7145294602558083e-06, "loss": 0.84292769, "num_input_tokens_seen": 72301215, "step": 3345, "time_per_iteration": 2.6400160789489746 }, { "auxiliary_loss_clip": 0.0117163, "auxiliary_loss_mlp": 0.01029109, "balance_loss_clip": 1.0546658, "balance_loss_mlp": 1.02030516, "epoch": 0.4023327120783984, "flos": 33838912056960.0, "grad_norm": 1.8938298206673478, "language_loss": 0.70647985, "learning_rate": 2.713801844093241e-06, "loss": 0.72848725, "num_input_tokens_seen": 72322365, "step": 3346, "time_per_iteration": 2.6522574424743652 }, { "auxiliary_loss_clip": 0.01171881, "auxiliary_loss_mlp": 0.01026741, "balance_loss_clip": 1.05360007, "balance_loss_mlp": 1.01845562, "epoch": 0.40245295496903744, "flos": 26900252069760.0, "grad_norm": 1.9235033156095243, "language_loss": 0.88549435, "learning_rate": 2.7130741196475014e-06, "loss": 0.90748054, "num_input_tokens_seen": 72340495, "step": 3347, "time_per_iteration": 2.5933449268341064 }, { "auxiliary_loss_clip": 0.01159105, "auxiliary_loss_mlp": 0.01028449, "balance_loss_clip": 1.05608964, "balance_loss_mlp": 1.02011597, "epoch": 0.40257319785967655, "flos": 36902738436480.0, "grad_norm": 1.8713785663782259, "language_loss": 0.79032999, "learning_rate": 2.7123462870289848e-06, "loss": 0.81220555, "num_input_tokens_seen": 72360545, "step": 3348, "time_per_iteration": 2.696370840072632 }, { "auxiliary_loss_clip": 0.01159482, "auxiliary_loss_mlp": 0.01029999, "balance_loss_clip": 1.05209994, "balance_loss_mlp": 1.02203608, "epoch": 0.40269344075031566, "flos": 24353180703360.0, "grad_norm": 1.6503283992832696, "language_loss": 0.80953014, "learning_rate": 2.711618346348102e-06, "loss": 0.83142495, "num_input_tokens_seen": 72381070, "step": 3349, "time_per_iteration": 2.617626190185547 }, { "auxiliary_loss_clip": 0.01153183, "auxiliary_loss_mlp": 0.01031329, "balance_loss_clip": 1.05446875, "balance_loss_mlp": 1.02366388, "epoch": 0.4028136836409547, "flos": 14389657614720.0, "grad_norm": 2.562961298300526, "language_loss": 0.63350964, "learning_rate": 2.7108902977152825e-06, "loss": 0.65535474, "num_input_tokens_seen": 72398970, "step": 3350, "time_per_iteration": 2.582167863845825 }, { "auxiliary_loss_clip": 0.01169044, "auxiliary_loss_mlp": 0.01033643, "balance_loss_clip": 1.05374706, "balance_loss_mlp": 1.02502418, "epoch": 0.4029339265315938, "flos": 26136037284480.0, "grad_norm": 2.602953884498685, "language_loss": 0.74741685, "learning_rate": 2.7101621412409704e-06, "loss": 0.76944375, "num_input_tokens_seen": 72418455, "step": 3351, "time_per_iteration": 2.602501153945923 }, { "auxiliary_loss_clip": 0.01186417, "auxiliary_loss_mlp": 0.01026525, "balance_loss_clip": 1.05664051, "balance_loss_mlp": 1.01819241, "epoch": 0.40305416942223293, "flos": 23256325042560.0, "grad_norm": 1.7932023289001278, "language_loss": 0.85906827, "learning_rate": 2.7094338770356256e-06, "loss": 0.88119769, "num_input_tokens_seen": 72437540, "step": 3352, "time_per_iteration": 2.5133907794952393 }, { "auxiliary_loss_clip": 0.01153097, "auxiliary_loss_mlp": 0.01030257, "balance_loss_clip": 1.05299795, "balance_loss_mlp": 1.02175784, "epoch": 0.403174412312872, "flos": 27089645506560.0, "grad_norm": 2.1525940304014806, "language_loss": 0.64131194, "learning_rate": 2.708705505209726e-06, "loss": 0.66314554, "num_input_tokens_seen": 72458315, "step": 3353, "time_per_iteration": 2.646333694458008 }, { "auxiliary_loss_clip": 0.01121275, "auxiliary_loss_mlp": 0.01038338, "balance_loss_clip": 1.04793608, "balance_loss_mlp": 1.03017175, "epoch": 0.4032946552035111, "flos": 21756336065280.0, "grad_norm": 2.3315895273689895, "language_loss": 0.91552693, "learning_rate": 2.7079770258737646e-06, "loss": 0.93712306, "num_input_tokens_seen": 72476225, "step": 3354, "time_per_iteration": 2.663471221923828 }, { "auxiliary_loss_clip": 0.01138265, "auxiliary_loss_mlp": 0.01032407, "balance_loss_clip": 1.04902697, "balance_loss_mlp": 1.02328706, "epoch": 0.4034148980941502, "flos": 17343956448000.0, "grad_norm": 2.2391837588450767, "language_loss": 0.75504506, "learning_rate": 2.707248439138251e-06, "loss": 0.77675176, "num_input_tokens_seen": 72492460, "step": 3355, "time_per_iteration": 2.6243205070495605 }, { "auxiliary_loss_clip": 0.01154284, "auxiliary_loss_mlp": 0.01027423, "balance_loss_clip": 1.05685687, "balance_loss_mlp": 1.01915026, "epoch": 0.40353514098478926, "flos": 22017838055040.0, "grad_norm": 1.6865313690719599, "language_loss": 0.6512751, "learning_rate": 2.7065197451137114e-06, "loss": 0.67309225, "num_input_tokens_seen": 72513840, "step": 3356, "time_per_iteration": 2.6342687606811523 }, { "auxiliary_loss_clip": 0.01160386, "auxiliary_loss_mlp": 0.01037791, "balance_loss_clip": 1.05631387, "balance_loss_mlp": 1.02973199, "epoch": 0.4036553838754284, "flos": 14246446089600.0, "grad_norm": 1.8832571306180066, "language_loss": 0.67507762, "learning_rate": 2.7057909439106894e-06, "loss": 0.69705939, "num_input_tokens_seen": 72531695, "step": 3357, "time_per_iteration": 2.573958158493042 }, { "auxiliary_loss_clip": 0.01161151, "auxiliary_loss_mlp": 0.00761934, "balance_loss_clip": 1.05203533, "balance_loss_mlp": 1.00032747, "epoch": 0.40377562676606743, "flos": 24790644443520.0, "grad_norm": 2.6948036368492088, "language_loss": 0.78594089, "learning_rate": 2.7050620356397417e-06, "loss": 0.80517173, "num_input_tokens_seen": 72550645, "step": 3358, "time_per_iteration": 2.581328868865967 }, { "auxiliary_loss_clip": 0.01185296, "auxiliary_loss_mlp": 0.01033901, "balance_loss_clip": 1.05949938, "balance_loss_mlp": 1.02621472, "epoch": 0.40389586965670654, "flos": 24061226958720.0, "grad_norm": 1.6418431255661425, "language_loss": 0.72153109, "learning_rate": 2.7043330204114437e-06, "loss": 0.74372303, "num_input_tokens_seen": 72569355, "step": 3359, "time_per_iteration": 2.517836570739746 }, { "auxiliary_loss_clip": 0.01179626, "auxiliary_loss_mlp": 0.01022579, "balance_loss_clip": 1.05387998, "balance_loss_mlp": 1.01471734, "epoch": 0.40401611254734565, "flos": 16399613934720.0, "grad_norm": 2.2536660953669942, "language_loss": 0.8557322, "learning_rate": 2.7036038983363862e-06, "loss": 0.87775427, "num_input_tokens_seen": 72585960, "step": 3360, "time_per_iteration": 2.479858160018921 }, { "auxiliary_loss_clip": 0.01165493, "auxiliary_loss_mlp": 0.01028238, "balance_loss_clip": 1.05440688, "balance_loss_mlp": 1.02068579, "epoch": 0.4041363554379847, "flos": 23988220565760.0, "grad_norm": 1.8266279295811543, "language_loss": 0.84515607, "learning_rate": 2.702874669525177e-06, "loss": 0.86709332, "num_input_tokens_seen": 72604440, "step": 3361, "time_per_iteration": 2.5679666996002197 }, { "auxiliary_loss_clip": 0.01140642, "auxiliary_loss_mlp": 0.01026162, "balance_loss_clip": 1.05202353, "balance_loss_mlp": 1.01885188, "epoch": 0.4042565983286238, "flos": 28401964899840.0, "grad_norm": 1.8339099395275695, "language_loss": 0.69689, "learning_rate": 2.7021453340884394e-06, "loss": 0.71855801, "num_input_tokens_seen": 72622165, "step": 3362, "time_per_iteration": 2.6638176441192627 }, { "auxiliary_loss_clip": 0.01144957, "auxiliary_loss_mlp": 0.00762084, "balance_loss_clip": 1.05172431, "balance_loss_mlp": 1.00031936, "epoch": 0.40437684121926293, "flos": 17710963660800.0, "grad_norm": 2.0523532767642374, "language_loss": 0.72985411, "learning_rate": 2.7014158921368125e-06, "loss": 0.74892449, "num_input_tokens_seen": 72640490, "step": 3363, "time_per_iteration": 3.30672550201416 }, { "auxiliary_loss_clip": 0.01183736, "auxiliary_loss_mlp": 0.01033236, "balance_loss_clip": 1.05586767, "balance_loss_mlp": 1.02553499, "epoch": 0.404497084109902, "flos": 24018959629440.0, "grad_norm": 1.7684909724831732, "language_loss": 0.85333055, "learning_rate": 2.700686343780953e-06, "loss": 0.87550026, "num_input_tokens_seen": 72660360, "step": 3364, "time_per_iteration": 2.5398807525634766 }, { "auxiliary_loss_clip": 0.01155766, "auxiliary_loss_mlp": 0.01029685, "balance_loss_clip": 1.05119228, "balance_loss_mlp": 1.0219934, "epoch": 0.4046173270005411, "flos": 22929861306240.0, "grad_norm": 2.090622306732791, "language_loss": 0.88286197, "learning_rate": 2.699956689131532e-06, "loss": 0.90471649, "num_input_tokens_seen": 72680345, "step": 3365, "time_per_iteration": 2.5826849937438965 }, { "auxiliary_loss_clip": 0.01156121, "auxiliary_loss_mlp": 0.01030659, "balance_loss_clip": 1.05171084, "balance_loss_mlp": 1.02319098, "epoch": 0.4047375698911802, "flos": 20668135582080.0, "grad_norm": 2.481856520410991, "language_loss": 0.84837413, "learning_rate": 2.699226928299238e-06, "loss": 0.870242, "num_input_tokens_seen": 72698365, "step": 3366, "time_per_iteration": 2.5970826148986816 }, { "auxiliary_loss_clip": 0.01168874, "auxiliary_loss_mlp": 0.01023436, "balance_loss_clip": 1.05396962, "balance_loss_mlp": 1.01584864, "epoch": 0.40485781278181926, "flos": 28912865996160.0, "grad_norm": 2.1316078681279684, "language_loss": 0.78825426, "learning_rate": 2.698497061394774e-06, "loss": 0.81017733, "num_input_tokens_seen": 72716850, "step": 3367, "time_per_iteration": 2.594989538192749 }, { "auxiliary_loss_clip": 0.01147468, "auxiliary_loss_mlp": 0.00761659, "balance_loss_clip": 1.05406976, "balance_loss_mlp": 1.00029325, "epoch": 0.40497805567245837, "flos": 23148377694720.0, "grad_norm": 1.6192981142479435, "language_loss": 0.80655795, "learning_rate": 2.6977670885288627e-06, "loss": 0.82564926, "num_input_tokens_seen": 72738250, "step": 3368, "time_per_iteration": 3.446361780166626 }, { "auxiliary_loss_clip": 0.01144108, "auxiliary_loss_mlp": 0.01030713, "balance_loss_clip": 1.04925954, "balance_loss_mlp": 1.02263618, "epoch": 0.4050982985630975, "flos": 16289404030080.0, "grad_norm": 2.290976709221643, "language_loss": 0.75061226, "learning_rate": 2.6970370098122378e-06, "loss": 0.77236044, "num_input_tokens_seen": 72755235, "step": 3369, "time_per_iteration": 3.3680992126464844 }, { "auxiliary_loss_clip": 0.01182233, "auxiliary_loss_mlp": 0.0102394, "balance_loss_clip": 1.05521059, "balance_loss_mlp": 1.01633418, "epoch": 0.40521854145373654, "flos": 34459484353920.0, "grad_norm": 1.4719927728825715, "language_loss": 0.86352313, "learning_rate": 2.6963068253556535e-06, "loss": 0.88558483, "num_input_tokens_seen": 72776620, "step": 3370, "time_per_iteration": 2.6271915435791016 }, { "auxiliary_loss_clip": 0.0117408, "auxiliary_loss_mlp": 0.01028726, "balance_loss_clip": 1.0530355, "balance_loss_mlp": 1.02022314, "epoch": 0.40533878434437565, "flos": 25331099454720.0, "grad_norm": 1.9416559640622753, "language_loss": 0.85777557, "learning_rate": 2.6955765352698763e-06, "loss": 0.8798036, "num_input_tokens_seen": 72796765, "step": 3371, "time_per_iteration": 2.6010568141937256 }, { "auxiliary_loss_clip": 0.01183495, "auxiliary_loss_mlp": 0.01032936, "balance_loss_clip": 1.05358016, "balance_loss_mlp": 1.02455008, "epoch": 0.40545902723501476, "flos": 15012061505280.0, "grad_norm": 10.242946906854163, "language_loss": 0.73306239, "learning_rate": 2.6948461396656923e-06, "loss": 0.75522673, "num_input_tokens_seen": 72814175, "step": 3372, "time_per_iteration": 2.5100531578063965 }, { "auxiliary_loss_clip": 0.01171231, "auxiliary_loss_mlp": 0.01031813, "balance_loss_clip": 1.05301082, "balance_loss_mlp": 1.02395439, "epoch": 0.4055792701256538, "flos": 25521103422720.0, "grad_norm": 2.4588589273752044, "language_loss": 0.74463207, "learning_rate": 2.6941156386539013e-06, "loss": 0.76666248, "num_input_tokens_seen": 72834125, "step": 3373, "time_per_iteration": 2.6117379665374756 }, { "auxiliary_loss_clip": 0.01154913, "auxiliary_loss_mlp": 0.01036591, "balance_loss_clip": 1.0575881, "balance_loss_mlp": 1.02832997, "epoch": 0.4056995130162929, "flos": 19574583972480.0, "grad_norm": 7.214921525087993, "language_loss": 0.81089365, "learning_rate": 2.6933850323453203e-06, "loss": 0.83280867, "num_input_tokens_seen": 72852570, "step": 3374, "time_per_iteration": 2.5762343406677246 }, { "auxiliary_loss_clip": 0.01184664, "auxiliary_loss_mlp": 0.01029892, "balance_loss_clip": 1.05804932, "balance_loss_mlp": 1.02194679, "epoch": 0.405819755906932, "flos": 15413794191360.0, "grad_norm": 1.8213181567669736, "language_loss": 0.74542809, "learning_rate": 2.6926543208507806e-06, "loss": 0.76757365, "num_input_tokens_seen": 72871250, "step": 3375, "time_per_iteration": 2.527643918991089 }, { "auxiliary_loss_clip": 0.01171067, "auxiliary_loss_mlp": 0.01026178, "balance_loss_clip": 1.05571556, "balance_loss_mlp": 1.01831627, "epoch": 0.4059399987975711, "flos": 21433930565760.0, "grad_norm": 1.9899567321470668, "language_loss": 0.79851091, "learning_rate": 2.6919235042811316e-06, "loss": 0.82048339, "num_input_tokens_seen": 72890035, "step": 3376, "time_per_iteration": 2.526902675628662 }, { "auxiliary_loss_clip": 0.01135962, "auxiliary_loss_mlp": 0.0103283, "balance_loss_clip": 1.04816639, "balance_loss_mlp": 1.02508712, "epoch": 0.4060602416882102, "flos": 25556942217600.0, "grad_norm": 2.017413014360003, "language_loss": 0.76488125, "learning_rate": 2.691192582747237e-06, "loss": 0.78656918, "num_input_tokens_seen": 72909665, "step": 3377, "time_per_iteration": 2.615504264831543 }, { "auxiliary_loss_clip": 0.01181551, "auxiliary_loss_mlp": 0.01028946, "balance_loss_clip": 1.05399084, "balance_loss_mlp": 1.0206964, "epoch": 0.40618048457884925, "flos": 23766759262080.0, "grad_norm": 1.7520978756261272, "language_loss": 0.73949784, "learning_rate": 2.6904615563599765e-06, "loss": 0.76160282, "num_input_tokens_seen": 72929465, "step": 3378, "time_per_iteration": 2.5301167964935303 }, { "auxiliary_loss_clip": 0.01135039, "auxiliary_loss_mlp": 0.01026069, "balance_loss_clip": 1.04868615, "balance_loss_mlp": 1.01782, "epoch": 0.40630072746948837, "flos": 17639681120640.0, "grad_norm": 1.7204841176003094, "language_loss": 0.83438599, "learning_rate": 2.6897304252302477e-06, "loss": 0.85599709, "num_input_tokens_seen": 72946785, "step": 3379, "time_per_iteration": 2.5903613567352295 }, { "auxiliary_loss_clip": 0.01038265, "auxiliary_loss_mlp": 0.01005489, "balance_loss_clip": 1.01484084, "balance_loss_mlp": 1.00416553, "epoch": 0.4064209703601275, "flos": 60836053063680.0, "grad_norm": 0.785068124290811, "language_loss": 0.54838824, "learning_rate": 2.688999189468962e-06, "loss": 0.56882572, "num_input_tokens_seen": 73003215, "step": 3380, "time_per_iteration": 3.03806471824646 }, { "auxiliary_loss_clip": 0.01165651, "auxiliary_loss_mlp": 0.01026091, "balance_loss_clip": 1.05370843, "balance_loss_mlp": 1.01821089, "epoch": 0.40654121325076653, "flos": 24024346669440.0, "grad_norm": 2.9981274587368683, "language_loss": 0.75665265, "learning_rate": 2.6882678491870464e-06, "loss": 0.77857006, "num_input_tokens_seen": 73023650, "step": 3381, "time_per_iteration": 2.551246166229248 }, { "auxiliary_loss_clip": 0.01171999, "auxiliary_loss_mlp": 0.01027512, "balance_loss_clip": 1.05359077, "balance_loss_mlp": 1.0190835, "epoch": 0.40666145614140564, "flos": 27344252085120.0, "grad_norm": 1.868055768090866, "language_loss": 0.71176028, "learning_rate": 2.6875364044954453e-06, "loss": 0.73375535, "num_input_tokens_seen": 73043880, "step": 3382, "time_per_iteration": 2.5642755031585693 }, { "auxiliary_loss_clip": 0.01149895, "auxiliary_loss_mlp": 0.0102966, "balance_loss_clip": 1.04704177, "balance_loss_mlp": 1.02249002, "epoch": 0.40678169903204475, "flos": 26176724415360.0, "grad_norm": 1.604990727285972, "language_loss": 0.82226598, "learning_rate": 2.6868048555051185e-06, "loss": 0.84406149, "num_input_tokens_seen": 73065410, "step": 3383, "time_per_iteration": 2.623302698135376 }, { "auxiliary_loss_clip": 0.01159318, "auxiliary_loss_mlp": 0.01029995, "balance_loss_clip": 1.04959917, "balance_loss_mlp": 1.02158451, "epoch": 0.4069019419226838, "flos": 28622420622720.0, "grad_norm": 2.882999968564185, "language_loss": 0.85455835, "learning_rate": 2.686073202327041e-06, "loss": 0.87645149, "num_input_tokens_seen": 73084410, "step": 3384, "time_per_iteration": 2.6350769996643066 }, { "auxiliary_loss_clip": 0.01140327, "auxiliary_loss_mlp": 0.01025861, "balance_loss_clip": 1.04533112, "balance_loss_mlp": 1.01810622, "epoch": 0.4070221848133229, "flos": 25229006023680.0, "grad_norm": 2.1537236730235514, "language_loss": 0.73322797, "learning_rate": 2.6853414450722043e-06, "loss": 0.75488985, "num_input_tokens_seen": 73104075, "step": 3385, "time_per_iteration": 2.6010942459106445 }, { "auxiliary_loss_clip": 0.01165518, "auxiliary_loss_mlp": 0.01028665, "balance_loss_clip": 1.05228591, "balance_loss_mlp": 1.02088356, "epoch": 0.40714242770396203, "flos": 18405224709120.0, "grad_norm": 1.7702142282644202, "language_loss": 0.8534984, "learning_rate": 2.684609583851616e-06, "loss": 0.87544024, "num_input_tokens_seen": 73122250, "step": 3386, "time_per_iteration": 2.518399953842163 }, { "auxiliary_loss_clip": 0.01125395, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 1.04804468, "balance_loss_mlp": 1.02492714, "epoch": 0.4072626705946011, "flos": 30228920403840.0, "grad_norm": 1.6014395355827715, "language_loss": 0.80749857, "learning_rate": 2.683877618776297e-06, "loss": 0.82908267, "num_input_tokens_seen": 73144505, "step": 3387, "time_per_iteration": 2.7148594856262207 }, { "auxiliary_loss_clip": 0.01147772, "auxiliary_loss_mlp": 0.01038862, "balance_loss_clip": 1.04895973, "balance_loss_mlp": 1.03089917, "epoch": 0.4073829134852402, "flos": 21834549930240.0, "grad_norm": 1.9840971678146528, "language_loss": 0.73917007, "learning_rate": 2.6831455499572876e-06, "loss": 0.76103652, "num_input_tokens_seen": 73162440, "step": 3388, "time_per_iteration": 2.6191625595092773 }, { "auxiliary_loss_clip": 0.01180261, "auxiliary_loss_mlp": 0.01031141, "balance_loss_clip": 1.05270314, "balance_loss_mlp": 1.02314782, "epoch": 0.40750315637587925, "flos": 25260211964160.0, "grad_norm": 1.8940229924493592, "language_loss": 0.77698088, "learning_rate": 2.682413377505641e-06, "loss": 0.79909492, "num_input_tokens_seen": 73181245, "step": 3389, "time_per_iteration": 3.3619837760925293 }, { "auxiliary_loss_clip": 0.01168284, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.05195904, "balance_loss_mlp": 1.021909, "epoch": 0.40762339926651836, "flos": 19712767593600.0, "grad_norm": 1.767825487476804, "language_loss": 0.76737332, "learning_rate": 2.6816811015324284e-06, "loss": 0.789352, "num_input_tokens_seen": 73199295, "step": 3390, "time_per_iteration": 2.5624876022338867 }, { "auxiliary_loss_clip": 0.01073336, "auxiliary_loss_mlp": 0.01002426, "balance_loss_clip": 1.01714039, "balance_loss_mlp": 1.00111437, "epoch": 0.40774364215715747, "flos": 71449307314560.0, "grad_norm": 0.7242210274620713, "language_loss": 0.56799078, "learning_rate": 2.6809487221487343e-06, "loss": 0.58874846, "num_input_tokens_seen": 73258780, "step": 3391, "time_per_iteration": 3.0162415504455566 }, { "auxiliary_loss_clip": 0.01159245, "auxiliary_loss_mlp": 0.01028958, "balance_loss_clip": 1.05013514, "balance_loss_mlp": 1.02137637, "epoch": 0.4078638850477965, "flos": 15084134144640.0, "grad_norm": 2.3985611486461673, "language_loss": 0.81879199, "learning_rate": 2.6802162394656605e-06, "loss": 0.84067404, "num_input_tokens_seen": 73275490, "step": 3392, "time_per_iteration": 2.5482261180877686 }, { "auxiliary_loss_clip": 0.0114651, "auxiliary_loss_mlp": 0.01032548, "balance_loss_clip": 1.04619575, "balance_loss_mlp": 1.02471554, "epoch": 0.40798412793843564, "flos": 23842890138240.0, "grad_norm": 1.6658248518561067, "language_loss": 0.71785086, "learning_rate": 2.679483653594324e-06, "loss": 0.73964143, "num_input_tokens_seen": 73297260, "step": 3393, "time_per_iteration": 2.5991175174713135 }, { "auxiliary_loss_clip": 0.01169868, "auxiliary_loss_mlp": 0.01025578, "balance_loss_clip": 1.05210996, "balance_loss_mlp": 1.01804113, "epoch": 0.40810437082907475, "flos": 21065774117760.0, "grad_norm": 3.4762855647919446, "language_loss": 0.76505554, "learning_rate": 2.678750964645857e-06, "loss": 0.78701001, "num_input_tokens_seen": 73316340, "step": 3394, "time_per_iteration": 3.3758387565612793 }, { "auxiliary_loss_clip": 0.01168864, "auxiliary_loss_mlp": 0.01035714, "balance_loss_clip": 1.05544674, "balance_loss_mlp": 1.02759564, "epoch": 0.4082246137197138, "flos": 11321377948800.0, "grad_norm": 2.4430279004594273, "language_loss": 0.83579648, "learning_rate": 2.6780181727314094e-06, "loss": 0.85784227, "num_input_tokens_seen": 73331245, "step": 3395, "time_per_iteration": 3.260948657989502 }, { "auxiliary_loss_clip": 0.011448, "auxiliary_loss_mlp": 0.00761816, "balance_loss_clip": 1.05045485, "balance_loss_mlp": 1.00033236, "epoch": 0.4083448566103529, "flos": 19062569554560.0, "grad_norm": 1.7432751154832158, "language_loss": 0.78004408, "learning_rate": 2.6772852779621435e-06, "loss": 0.79911023, "num_input_tokens_seen": 73349105, "step": 3396, "time_per_iteration": 2.5779008865356445 }, { "auxiliary_loss_clip": 0.01161512, "auxiliary_loss_mlp": 0.00761111, "balance_loss_clip": 1.05363882, "balance_loss_mlp": 1.00033188, "epoch": 0.408465099500992, "flos": 23550254035200.0, "grad_norm": 1.968634378698742, "language_loss": 0.85942686, "learning_rate": 2.676552280449239e-06, "loss": 0.87865317, "num_input_tokens_seen": 73368990, "step": 3397, "time_per_iteration": 2.5815048217773438 }, { "auxiliary_loss_clip": 0.01158546, "auxiliary_loss_mlp": 0.01031427, "balance_loss_clip": 1.05049431, "balance_loss_mlp": 1.0236789, "epoch": 0.4085853423916311, "flos": 12750012558720.0, "grad_norm": 3.3297550646975935, "language_loss": 0.75370276, "learning_rate": 2.6758191803038917e-06, "loss": 0.77560246, "num_input_tokens_seen": 73387485, "step": 3398, "time_per_iteration": 2.5032663345336914 }, { "auxiliary_loss_clip": 0.01105249, "auxiliary_loss_mlp": 0.01033071, "balance_loss_clip": 1.05013204, "balance_loss_mlp": 1.02508998, "epoch": 0.4087055852822702, "flos": 24353072962560.0, "grad_norm": 1.7441885885953403, "language_loss": 0.82493645, "learning_rate": 2.6750859776373125e-06, "loss": 0.84631968, "num_input_tokens_seen": 73406940, "step": 3399, "time_per_iteration": 2.6794519424438477 }, { "auxiliary_loss_clip": 0.01025883, "auxiliary_loss_mlp": 0.01001615, "balance_loss_clip": 1.0183444, "balance_loss_mlp": 1.00028598, "epoch": 0.4088258281729093, "flos": 66387950720640.0, "grad_norm": 0.7683290533082459, "language_loss": 0.60432839, "learning_rate": 2.674352672560727e-06, "loss": 0.62460339, "num_input_tokens_seen": 73468385, "step": 3400, "time_per_iteration": 3.2302324771881104 }, { "auxiliary_loss_clip": 0.01137777, "auxiliary_loss_mlp": 0.01032986, "balance_loss_clip": 1.04827857, "balance_loss_mlp": 1.0249033, "epoch": 0.40894607106354836, "flos": 20449260057600.0, "grad_norm": 1.6351414428473174, "language_loss": 0.77110922, "learning_rate": 2.673619265185377e-06, "loss": 0.79281682, "num_input_tokens_seen": 73488225, "step": 3401, "time_per_iteration": 2.6056385040283203 }, { "auxiliary_loss_clip": 0.01168916, "auxiliary_loss_mlp": 0.01033712, "balance_loss_clip": 1.0521996, "balance_loss_mlp": 1.02506959, "epoch": 0.40906631395418747, "flos": 27053627143680.0, "grad_norm": 2.235404804621795, "language_loss": 0.77915442, "learning_rate": 2.672885755622521e-06, "loss": 0.80118072, "num_input_tokens_seen": 73510640, "step": 3402, "time_per_iteration": 2.6092770099639893 }, { "auxiliary_loss_clip": 0.01125264, "auxiliary_loss_mlp": 0.01029257, "balance_loss_clip": 1.04774094, "balance_loss_mlp": 1.0218122, "epoch": 0.4091865568448266, "flos": 25484151306240.0, "grad_norm": 2.0628766122999025, "language_loss": 0.70550513, "learning_rate": 2.67215214398343e-06, "loss": 0.7270503, "num_input_tokens_seen": 73530655, "step": 3403, "time_per_iteration": 2.646634101867676 }, { "auxiliary_loss_clip": 0.01129305, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.04636335, "balance_loss_mlp": 1.02543211, "epoch": 0.40930679973546563, "flos": 28657864368000.0, "grad_norm": 2.651354143430776, "language_loss": 0.78181088, "learning_rate": 2.671418430379393e-06, "loss": 0.80344045, "num_input_tokens_seen": 73549340, "step": 3404, "time_per_iteration": 2.695010185241699 }, { "auxiliary_loss_clip": 0.01181754, "auxiliary_loss_mlp": 0.010284, "balance_loss_clip": 1.05332351, "balance_loss_mlp": 1.02088976, "epoch": 0.40942704262610474, "flos": 20886292834560.0, "grad_norm": 1.85514036255539, "language_loss": 0.83606291, "learning_rate": 2.670684614921715e-06, "loss": 0.85816443, "num_input_tokens_seen": 73568315, "step": 3405, "time_per_iteration": 2.4935150146484375 }, { "auxiliary_loss_clip": 0.01155089, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.04949236, "balance_loss_mlp": 1.02073383, "epoch": 0.4095472855167438, "flos": 21618080616960.0, "grad_norm": 2.166322051332972, "language_loss": 0.695768, "learning_rate": 2.6699506977217128e-06, "loss": 0.71760648, "num_input_tokens_seen": 73588490, "step": 3406, "time_per_iteration": 2.5986838340759277 }, { "auxiliary_loss_clip": 0.01168335, "auxiliary_loss_mlp": 0.01028656, "balance_loss_clip": 1.05620766, "balance_loss_mlp": 1.02059197, "epoch": 0.4096675284073829, "flos": 27926112499200.0, "grad_norm": 2.1728493726966875, "language_loss": 0.70028335, "learning_rate": 2.6692166788907233e-06, "loss": 0.72225332, "num_input_tokens_seen": 73608685, "step": 3407, "time_per_iteration": 2.5935885906219482 }, { "auxiliary_loss_clip": 0.0115437, "auxiliary_loss_mlp": 0.01028896, "balance_loss_clip": 1.05020642, "balance_loss_mlp": 1.02062893, "epoch": 0.409787771298022, "flos": 19206607092480.0, "grad_norm": 2.292325951293512, "language_loss": 0.76727957, "learning_rate": 2.6684825585400957e-06, "loss": 0.78911221, "num_input_tokens_seen": 73627630, "step": 3408, "time_per_iteration": 2.578110694885254 }, { "auxiliary_loss_clip": 0.01049469, "auxiliary_loss_mlp": 0.01001344, "balance_loss_clip": 1.01717615, "balance_loss_mlp": 1.00000846, "epoch": 0.4099080141886611, "flos": 59269234832640.0, "grad_norm": 0.813098794828117, "language_loss": 0.6513896, "learning_rate": 2.6677483367811947e-06, "loss": 0.67189777, "num_input_tokens_seen": 73687670, "step": 3409, "time_per_iteration": 3.2548367977142334 }, { "auxiliary_loss_clip": 0.0116922, "auxiliary_loss_mlp": 0.01025279, "balance_loss_clip": 1.05035162, "balance_loss_mlp": 1.01785874, "epoch": 0.4100282570793002, "flos": 21906443001600.0, "grad_norm": 1.6520280283916442, "language_loss": 0.75545847, "learning_rate": 2.6670140137254028e-06, "loss": 0.77740347, "num_input_tokens_seen": 73707145, "step": 3410, "time_per_iteration": 2.548207998275757 }, { "auxiliary_loss_clip": 0.0112179, "auxiliary_loss_mlp": 0.01032324, "balance_loss_clip": 1.04684377, "balance_loss_mlp": 1.02424157, "epoch": 0.4101484999699393, "flos": 18551596631040.0, "grad_norm": 2.2265577402915806, "language_loss": 0.89837545, "learning_rate": 2.666279589484115e-06, "loss": 0.91991663, "num_input_tokens_seen": 73725045, "step": 3411, "time_per_iteration": 2.6030380725860596 }, { "auxiliary_loss_clip": 0.01125636, "auxiliary_loss_mlp": 0.01030207, "balance_loss_clip": 1.04774678, "balance_loss_mlp": 1.02251852, "epoch": 0.41026874286057835, "flos": 19094529680640.0, "grad_norm": 1.8960066646899574, "language_loss": 0.8108139, "learning_rate": 2.6655450641687435e-06, "loss": 0.83237237, "num_input_tokens_seen": 73742610, "step": 3412, "time_per_iteration": 2.6661558151245117 }, { "auxiliary_loss_clip": 0.01180761, "auxiliary_loss_mlp": 0.01032541, "balance_loss_clip": 1.0547055, "balance_loss_mlp": 1.02477777, "epoch": 0.41038898575121746, "flos": 31209568588800.0, "grad_norm": 1.7179658132574744, "language_loss": 0.69332796, "learning_rate": 2.664810437890715e-06, "loss": 0.71546102, "num_input_tokens_seen": 73764280, "step": 3413, "time_per_iteration": 2.5865578651428223 }, { "auxiliary_loss_clip": 0.01105703, "auxiliary_loss_mlp": 0.01027005, "balance_loss_clip": 1.05055022, "balance_loss_mlp": 1.01895809, "epoch": 0.41050922864185657, "flos": 14355865895040.0, "grad_norm": 2.8751312955359434, "language_loss": 0.79619133, "learning_rate": 2.6640757107614714e-06, "loss": 0.81751835, "num_input_tokens_seen": 73782375, "step": 3414, "time_per_iteration": 3.4741129875183105 }, { "auxiliary_loss_clip": 0.01135976, "auxiliary_loss_mlp": 0.01027421, "balance_loss_clip": 1.05100405, "balance_loss_mlp": 1.01917768, "epoch": 0.4106294715324956, "flos": 30956290813440.0, "grad_norm": 2.427575559369949, "language_loss": 0.69315577, "learning_rate": 2.6633408828924697e-06, "loss": 0.71478975, "num_input_tokens_seen": 73801240, "step": 3415, "time_per_iteration": 2.6957147121429443 }, { "auxiliary_loss_clip": 0.01146431, "auxiliary_loss_mlp": 0.01029008, "balance_loss_clip": 1.05039692, "balance_loss_mlp": 1.02143216, "epoch": 0.41074971442313474, "flos": 24457321209600.0, "grad_norm": 1.6456039108239797, "language_loss": 0.69944888, "learning_rate": 2.662605954395185e-06, "loss": 0.72120333, "num_input_tokens_seen": 73821200, "step": 3416, "time_per_iteration": 2.6323070526123047 }, { "auxiliary_loss_clip": 0.01170873, "auxiliary_loss_mlp": 0.01027331, "balance_loss_clip": 1.05234885, "balance_loss_mlp": 1.01956999, "epoch": 0.41086995731377385, "flos": 21542991235200.0, "grad_norm": 1.7408522727418236, "language_loss": 0.83634198, "learning_rate": 2.6618709253811027e-06, "loss": 0.85832399, "num_input_tokens_seen": 73840655, "step": 3417, "time_per_iteration": 2.6192917823791504 }, { "auxiliary_loss_clip": 0.01178783, "auxiliary_loss_mlp": 0.01031507, "balance_loss_clip": 1.05532014, "balance_loss_mlp": 1.02449441, "epoch": 0.4109902002044129, "flos": 20702753314560.0, "grad_norm": 1.6071080935468598, "language_loss": 0.8743031, "learning_rate": 2.6611357959617277e-06, "loss": 0.89640599, "num_input_tokens_seen": 73860275, "step": 3418, "time_per_iteration": 2.498661756515503 }, { "auxiliary_loss_clip": 0.01135622, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.04992771, "balance_loss_mlp": 1.02506387, "epoch": 0.411110443095052, "flos": 18179992477440.0, "grad_norm": 1.8122977002664584, "language_loss": 0.90906507, "learning_rate": 2.660400566248578e-06, "loss": 0.93075019, "num_input_tokens_seen": 73878400, "step": 3419, "time_per_iteration": 2.58943510055542 }, { "auxiliary_loss_clip": 0.01141517, "auxiliary_loss_mlp": 0.01032264, "balance_loss_clip": 1.05065358, "balance_loss_mlp": 1.02350235, "epoch": 0.41123068598569107, "flos": 14575244209920.0, "grad_norm": 2.368406350496862, "language_loss": 0.66832113, "learning_rate": 2.6596652363531876e-06, "loss": 0.69005889, "num_input_tokens_seen": 73894275, "step": 3420, "time_per_iteration": 4.046716213226318 }, { "auxiliary_loss_clip": 0.01182424, "auxiliary_loss_mlp": 0.01033781, "balance_loss_clip": 1.05522251, "balance_loss_mlp": 1.02579403, "epoch": 0.4113509288763302, "flos": 21177995184000.0, "grad_norm": 1.480694397912182, "language_loss": 0.78417319, "learning_rate": 2.6589298063871055e-06, "loss": 0.80633533, "num_input_tokens_seen": 73914450, "step": 3421, "time_per_iteration": 3.371730327606201 }, { "auxiliary_loss_clip": 0.0118281, "auxiliary_loss_mlp": 0.01032287, "balance_loss_clip": 1.05589414, "balance_loss_mlp": 1.0248487, "epoch": 0.4114711717669693, "flos": 18442212739200.0, "grad_norm": 1.8936669088921665, "language_loss": 0.69675469, "learning_rate": 2.658194276461895e-06, "loss": 0.71890569, "num_input_tokens_seen": 73932375, "step": 3422, "time_per_iteration": 2.47825026512146 }, { "auxiliary_loss_clip": 0.01152002, "auxiliary_loss_mlp": 0.01037104, "balance_loss_clip": 1.04730034, "balance_loss_mlp": 1.02859879, "epoch": 0.41159141465760835, "flos": 27233395735680.0, "grad_norm": 2.1392674876623525, "language_loss": 0.66915703, "learning_rate": 2.6574586466891368e-06, "loss": 0.69104809, "num_input_tokens_seen": 73952850, "step": 3423, "time_per_iteration": 2.6013925075531006 }, { "auxiliary_loss_clip": 0.01155362, "auxiliary_loss_mlp": 0.00761125, "balance_loss_clip": 1.05198097, "balance_loss_mlp": 1.00028467, "epoch": 0.41171165754824746, "flos": 20006876154240.0, "grad_norm": 2.4563766766560162, "language_loss": 0.64842403, "learning_rate": 2.6567229171804247e-06, "loss": 0.66758895, "num_input_tokens_seen": 73970735, "step": 3424, "time_per_iteration": 2.636162757873535 }, { "auxiliary_loss_clip": 0.01146535, "auxiliary_loss_mlp": 0.01026377, "balance_loss_clip": 1.04787183, "balance_loss_mlp": 1.01833034, "epoch": 0.41183190043888657, "flos": 18004318035840.0, "grad_norm": 2.7046510782254227, "language_loss": 0.87705612, "learning_rate": 2.655987088047368e-06, "loss": 0.89878523, "num_input_tokens_seen": 73989080, "step": 3425, "time_per_iteration": 2.5636308193206787 }, { "auxiliary_loss_clip": 0.01151322, "auxiliary_loss_mlp": 0.01036911, "balance_loss_clip": 1.0504638, "balance_loss_mlp": 1.02848268, "epoch": 0.4119521433295256, "flos": 27163370171520.0, "grad_norm": 1.8845944029501807, "language_loss": 0.78399289, "learning_rate": 2.6552511594015912e-06, "loss": 0.80587524, "num_input_tokens_seen": 74009470, "step": 3426, "time_per_iteration": 2.6142683029174805 }, { "auxiliary_loss_clip": 0.01151017, "auxiliary_loss_mlp": 0.01032576, "balance_loss_clip": 1.04751325, "balance_loss_mlp": 1.02407634, "epoch": 0.41207238622016473, "flos": 15122020014720.0, "grad_norm": 1.8713548410316012, "language_loss": 0.85217512, "learning_rate": 2.654515131354735e-06, "loss": 0.87401104, "num_input_tokens_seen": 74027735, "step": 3427, "time_per_iteration": 2.568700075149536 }, { "auxiliary_loss_clip": 0.01141485, "auxiliary_loss_mlp": 0.01031974, "balance_loss_clip": 1.05004728, "balance_loss_mlp": 1.02427292, "epoch": 0.41219262911080384, "flos": 27052872958080.0, "grad_norm": 1.9298165493875126, "language_loss": 0.84699249, "learning_rate": 2.653779004018453e-06, "loss": 0.86872709, "num_input_tokens_seen": 74048300, "step": 3428, "time_per_iteration": 2.653779983520508 }, { "auxiliary_loss_clip": 0.01144699, "auxiliary_loss_mlp": 0.01026591, "balance_loss_clip": 1.04883552, "balance_loss_mlp": 1.01894331, "epoch": 0.4123128720014429, "flos": 24686360282880.0, "grad_norm": 1.8256237136121538, "language_loss": 0.82256877, "learning_rate": 2.653042777504417e-06, "loss": 0.84428167, "num_input_tokens_seen": 74070890, "step": 3429, "time_per_iteration": 2.623439311981201 }, { "auxiliary_loss_clip": 0.01158631, "auxiliary_loss_mlp": 0.01027105, "balance_loss_clip": 1.05021822, "balance_loss_mlp": 1.0186944, "epoch": 0.412433114892082, "flos": 26244774731520.0, "grad_norm": 1.8267891625611843, "language_loss": 0.80079323, "learning_rate": 2.6523064519243105e-06, "loss": 0.82265055, "num_input_tokens_seen": 74090460, "step": 3430, "time_per_iteration": 2.6495444774627686 }, { "auxiliary_loss_clip": 0.0117005, "auxiliary_loss_mlp": 0.01034109, "balance_loss_clip": 1.05569482, "balance_loss_mlp": 1.02588964, "epoch": 0.4125533577827211, "flos": 21361031913600.0, "grad_norm": 2.261948954856196, "language_loss": 0.78979701, "learning_rate": 2.6515700273898333e-06, "loss": 0.81183863, "num_input_tokens_seen": 74108335, "step": 3431, "time_per_iteration": 2.5409739017486572 }, { "auxiliary_loss_clip": 0.01142244, "auxiliary_loss_mlp": 0.01029514, "balance_loss_clip": 1.05063224, "balance_loss_mlp": 1.02145541, "epoch": 0.4126736006733602, "flos": 26067556005120.0, "grad_norm": 2.0623814743776467, "language_loss": 0.69585121, "learning_rate": 2.6508335040127018e-06, "loss": 0.71756876, "num_input_tokens_seen": 74128030, "step": 3432, "time_per_iteration": 2.581613063812256 }, { "auxiliary_loss_clip": 0.01171463, "auxiliary_loss_mlp": 0.01032832, "balance_loss_clip": 1.05363977, "balance_loss_mlp": 1.02460635, "epoch": 0.4127938435639993, "flos": 25666146541440.0, "grad_norm": 1.4727386581581858, "language_loss": 0.77011347, "learning_rate": 2.6500968819046446e-06, "loss": 0.7921564, "num_input_tokens_seen": 74148330, "step": 3433, "time_per_iteration": 2.569923162460327 }, { "auxiliary_loss_clip": 0.01128548, "auxiliary_loss_mlp": 0.01028175, "balance_loss_clip": 1.0458951, "balance_loss_mlp": 1.02039683, "epoch": 0.4129140864546384, "flos": 17995914253440.0, "grad_norm": 2.504284595247858, "language_loss": 0.59606034, "learning_rate": 2.649360161177408e-06, "loss": 0.6176275, "num_input_tokens_seen": 74163390, "step": 3434, "time_per_iteration": 2.561326265335083 }, { "auxiliary_loss_clip": 0.01173763, "auxiliary_loss_mlp": 0.01032917, "balance_loss_clip": 1.05361009, "balance_loss_mlp": 1.02422094, "epoch": 0.41303432934527745, "flos": 23732895715200.0, "grad_norm": 1.8528976360958294, "language_loss": 0.73316234, "learning_rate": 2.6486233419427504e-06, "loss": 0.75522923, "num_input_tokens_seen": 74183205, "step": 3435, "time_per_iteration": 2.553699493408203 }, { "auxiliary_loss_clip": 0.01133936, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.05167341, "balance_loss_mlp": 1.02637482, "epoch": 0.41315457223591656, "flos": 19755286318080.0, "grad_norm": 3.144277523459835, "language_loss": 0.75374877, "learning_rate": 2.6478864243124484e-06, "loss": 0.77543914, "num_input_tokens_seen": 74202870, "step": 3436, "time_per_iteration": 2.6695733070373535 }, { "auxiliary_loss_clip": 0.01165683, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.04940426, "balance_loss_mlp": 1.01985729, "epoch": 0.4132748151265556, "flos": 20923316778240.0, "grad_norm": 1.8250696474316512, "language_loss": 0.85167813, "learning_rate": 2.6471494083982903e-06, "loss": 0.87361312, "num_input_tokens_seen": 74222255, "step": 3437, "time_per_iteration": 2.545546770095825 }, { "auxiliary_loss_clip": 0.01141532, "auxiliary_loss_mlp": 0.0103275, "balance_loss_clip": 1.04746354, "balance_loss_mlp": 1.02453613, "epoch": 0.4133950580171947, "flos": 32232520016640.0, "grad_norm": 1.6652216992311457, "language_loss": 0.74851817, "learning_rate": 2.6464122943120818e-06, "loss": 0.77026105, "num_input_tokens_seen": 74242480, "step": 3438, "time_per_iteration": 2.678117275238037 }, { "auxiliary_loss_clip": 0.01139521, "auxiliary_loss_mlp": 0.01031753, "balance_loss_clip": 1.05001605, "balance_loss_mlp": 1.02358139, "epoch": 0.41351530090783384, "flos": 23292487059840.0, "grad_norm": 2.9304441451396706, "language_loss": 0.82671869, "learning_rate": 2.645675082165642e-06, "loss": 0.84843147, "num_input_tokens_seen": 74258690, "step": 3439, "time_per_iteration": 2.606657028198242 }, { "auxiliary_loss_clip": 0.01156303, "auxiliary_loss_mlp": 0.01033522, "balance_loss_clip": 1.05303693, "balance_loss_mlp": 1.02465868, "epoch": 0.4136355437984729, "flos": 25593571111680.0, "grad_norm": 2.2101939143279252, "language_loss": 0.76053929, "learning_rate": 2.644937772070806e-06, "loss": 0.7824375, "num_input_tokens_seen": 74277135, "step": 3440, "time_per_iteration": 2.6155617237091064 }, { "auxiliary_loss_clip": 0.01185192, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.05727005, "balance_loss_mlp": 1.01993537, "epoch": 0.413755786689112, "flos": 19828615933440.0, "grad_norm": 2.4516476655570907, "language_loss": 0.83773589, "learning_rate": 2.6442003641394225e-06, "loss": 0.85986638, "num_input_tokens_seen": 74294730, "step": 3441, "time_per_iteration": 3.339801788330078 }, { "auxiliary_loss_clip": 0.01151633, "auxiliary_loss_mlp": 0.01028226, "balance_loss_clip": 1.04970551, "balance_loss_mlp": 1.01990783, "epoch": 0.4138760295797511, "flos": 26870446759680.0, "grad_norm": 1.4325786358270425, "language_loss": 0.83850074, "learning_rate": 2.643462858483356e-06, "loss": 0.86029941, "num_input_tokens_seen": 74315015, "step": 3442, "time_per_iteration": 2.6005334854125977 }, { "auxiliary_loss_clip": 0.01118911, "auxiliary_loss_mlp": 0.0103159, "balance_loss_clip": 1.04484963, "balance_loss_mlp": 1.02343619, "epoch": 0.41399627247039017, "flos": 16399254798720.0, "grad_norm": 1.9826697309395884, "language_loss": 0.72459543, "learning_rate": 2.6427252552144856e-06, "loss": 0.74610043, "num_input_tokens_seen": 74333665, "step": 3443, "time_per_iteration": 2.6547365188598633 }, { "auxiliary_loss_clip": 0.01181277, "auxiliary_loss_mlp": 0.01031916, "balance_loss_clip": 1.05440044, "balance_loss_mlp": 1.02377415, "epoch": 0.4141165153610293, "flos": 22930220442240.0, "grad_norm": 1.7617620966457528, "language_loss": 0.74850881, "learning_rate": 2.6419875544447044e-06, "loss": 0.77064079, "num_input_tokens_seen": 74355065, "step": 3444, "time_per_iteration": 2.526935577392578 }, { "auxiliary_loss_clip": 0.01182776, "auxiliary_loss_mlp": 0.01030256, "balance_loss_clip": 1.05367172, "balance_loss_mlp": 1.02220941, "epoch": 0.4142367582516684, "flos": 25192556697600.0, "grad_norm": 1.6132265488720483, "language_loss": 0.71497381, "learning_rate": 2.6412497562859218e-06, "loss": 0.73710418, "num_input_tokens_seen": 74376345, "step": 3445, "time_per_iteration": 2.556248903274536 }, { "auxiliary_loss_clip": 0.01171414, "auxiliary_loss_mlp": 0.0102513, "balance_loss_clip": 1.05193639, "balance_loss_mlp": 1.01655889, "epoch": 0.41435700114230745, "flos": 21690476478720.0, "grad_norm": 2.3736010997847616, "language_loss": 0.76423168, "learning_rate": 2.6405118608500617e-06, "loss": 0.78619707, "num_input_tokens_seen": 74395170, "step": 3446, "time_per_iteration": 4.064232110977173 }, { "auxiliary_loss_clip": 0.01138995, "auxiliary_loss_mlp": 0.01028879, "balance_loss_clip": 1.05521131, "balance_loss_mlp": 1.02144599, "epoch": 0.41447724403294656, "flos": 25995160143360.0, "grad_norm": 1.6611064703982372, "language_loss": 0.8133899, "learning_rate": 2.6397738682490613e-06, "loss": 0.83506858, "num_input_tokens_seen": 74416070, "step": 3447, "time_per_iteration": 3.335458755493164 }, { "auxiliary_loss_clip": 0.01184088, "auxiliary_loss_mlp": 0.01030723, "balance_loss_clip": 1.05507481, "balance_loss_mlp": 1.02244091, "epoch": 0.41459748692358567, "flos": 18259678800000.0, "grad_norm": 1.6480215596762928, "language_loss": 0.75400931, "learning_rate": 2.6390357785948734e-06, "loss": 0.77615744, "num_input_tokens_seen": 74433185, "step": 3448, "time_per_iteration": 2.4961304664611816 }, { "auxiliary_loss_clip": 0.01168321, "auxiliary_loss_mlp": 0.0103384, "balance_loss_clip": 1.0540216, "balance_loss_mlp": 1.02569795, "epoch": 0.4147177298142247, "flos": 24168456034560.0, "grad_norm": 1.7699780354864054, "language_loss": 0.80116099, "learning_rate": 2.6382975919994667e-06, "loss": 0.82318264, "num_input_tokens_seen": 74453760, "step": 3449, "time_per_iteration": 2.575606107711792 }, { "auxiliary_loss_clip": 0.01154662, "auxiliary_loss_mlp": 0.01026599, "balance_loss_clip": 1.05027628, "balance_loss_mlp": 1.01942277, "epoch": 0.41483797270486383, "flos": 20084659056000.0, "grad_norm": 1.70684902275972, "language_loss": 0.72987962, "learning_rate": 2.637559308574822e-06, "loss": 0.7516923, "num_input_tokens_seen": 74473505, "step": 3450, "time_per_iteration": 2.610459804534912 }, { "auxiliary_loss_clip": 0.01181729, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.05361068, "balance_loss_mlp": 1.02242935, "epoch": 0.4149582155955029, "flos": 30081040110720.0, "grad_norm": 1.9380554810582133, "language_loss": 0.71125299, "learning_rate": 2.6368209284329376e-06, "loss": 0.73337674, "num_input_tokens_seen": 74494135, "step": 3451, "time_per_iteration": 2.5619311332702637 }, { "auxiliary_loss_clip": 0.01166845, "auxiliary_loss_mlp": 0.01030221, "balance_loss_clip": 1.05098128, "balance_loss_mlp": 1.02197766, "epoch": 0.415078458486142, "flos": 16764394504320.0, "grad_norm": 2.1583686447481236, "language_loss": 0.75415826, "learning_rate": 2.636082451685825e-06, "loss": 0.77612889, "num_input_tokens_seen": 74512335, "step": 3452, "time_per_iteration": 2.566957712173462 }, { "auxiliary_loss_clip": 0.01160149, "auxiliary_loss_mlp": 0.01027672, "balance_loss_clip": 1.05565333, "balance_loss_mlp": 1.01941681, "epoch": 0.4151987013767811, "flos": 26033692458240.0, "grad_norm": 1.7363858552840825, "language_loss": 0.86402082, "learning_rate": 2.6353438784455094e-06, "loss": 0.88589901, "num_input_tokens_seen": 74535620, "step": 3453, "time_per_iteration": 2.6165599822998047 }, { "auxiliary_loss_clip": 0.01154627, "auxiliary_loss_mlp": 0.01037755, "balance_loss_clip": 1.05469012, "balance_loss_mlp": 1.02868366, "epoch": 0.41531894426742016, "flos": 24608002763520.0, "grad_norm": 2.2414319328868397, "language_loss": 0.71548033, "learning_rate": 2.6346052088240326e-06, "loss": 0.73740411, "num_input_tokens_seen": 74555140, "step": 3454, "time_per_iteration": 2.6240410804748535 }, { "auxiliary_loss_clip": 0.01159684, "auxiliary_loss_mlp": 0.01030248, "balance_loss_clip": 1.05425763, "balance_loss_mlp": 1.02207673, "epoch": 0.4154391871580593, "flos": 14975791747200.0, "grad_norm": 1.9504896745296996, "language_loss": 0.76847076, "learning_rate": 2.63386644293345e-06, "loss": 0.79037011, "num_input_tokens_seen": 74571485, "step": 3455, "time_per_iteration": 2.537118911743164 }, { "auxiliary_loss_clip": 0.01135179, "auxiliary_loss_mlp": 0.01025847, "balance_loss_clip": 1.04720569, "balance_loss_mlp": 1.01832438, "epoch": 0.4155594300486984, "flos": 14647173194880.0, "grad_norm": 2.900292987531332, "language_loss": 0.83382159, "learning_rate": 2.633127580885833e-06, "loss": 0.8554318, "num_input_tokens_seen": 74585985, "step": 3456, "time_per_iteration": 2.5883262157440186 }, { "auxiliary_loss_clip": 0.01185709, "auxiliary_loss_mlp": 0.01030305, "balance_loss_clip": 1.05877113, "balance_loss_mlp": 1.02221644, "epoch": 0.41567967293933744, "flos": 29497276275840.0, "grad_norm": 2.2703279373711283, "language_loss": 0.64985311, "learning_rate": 2.632388622793265e-06, "loss": 0.67201316, "num_input_tokens_seen": 74605140, "step": 3457, "time_per_iteration": 2.5698139667510986 }, { "auxiliary_loss_clip": 0.01170143, "auxiliary_loss_mlp": 0.01034424, "balance_loss_clip": 1.0548532, "balance_loss_mlp": 1.0264132, "epoch": 0.41579991582997655, "flos": 19238387650560.0, "grad_norm": 1.654294166425432, "language_loss": 0.67915273, "learning_rate": 2.6316495687678457e-06, "loss": 0.7011984, "num_input_tokens_seen": 74623790, "step": 3458, "time_per_iteration": 2.540938138961792 }, { "auxiliary_loss_clip": 0.01121298, "auxiliary_loss_mlp": 0.0102994, "balance_loss_clip": 1.04912543, "balance_loss_mlp": 1.02199483, "epoch": 0.41592015872061566, "flos": 24462061804800.0, "grad_norm": 2.623476452610566, "language_loss": 0.76948071, "learning_rate": 2.6309104189216887e-06, "loss": 0.79099309, "num_input_tokens_seen": 74641355, "step": 3459, "time_per_iteration": 2.634948253631592 }, { "auxiliary_loss_clip": 0.01128404, "auxiliary_loss_mlp": 0.00762211, "balance_loss_clip": 1.04791546, "balance_loss_mlp": 1.00025272, "epoch": 0.4160404016112547, "flos": 20775651966720.0, "grad_norm": 2.0711615878356304, "language_loss": 0.74809933, "learning_rate": 2.630171173366923e-06, "loss": 0.7670055, "num_input_tokens_seen": 74657155, "step": 3460, "time_per_iteration": 2.617377519607544 }, { "auxiliary_loss_clip": 0.01127003, "auxiliary_loss_mlp": 0.01033806, "balance_loss_clip": 1.04962957, "balance_loss_mlp": 1.02555966, "epoch": 0.41616064450189383, "flos": 13916462820480.0, "grad_norm": 2.4789310500593724, "language_loss": 0.74332392, "learning_rate": 2.629431832215691e-06, "loss": 0.76493204, "num_input_tokens_seen": 74671960, "step": 3461, "time_per_iteration": 2.6102709770202637 }, { "auxiliary_loss_clip": 0.01153952, "auxiliary_loss_mlp": 0.01032804, "balance_loss_clip": 1.05398345, "balance_loss_mlp": 1.02434635, "epoch": 0.41628088739253294, "flos": 20010826650240.0, "grad_norm": 1.6431718563824094, "language_loss": 0.87276089, "learning_rate": 2.628692395580151e-06, "loss": 0.89462852, "num_input_tokens_seen": 74692050, "step": 3462, "time_per_iteration": 2.604914665222168 }, { "auxiliary_loss_clip": 0.01095035, "auxiliary_loss_mlp": 0.01033576, "balance_loss_clip": 1.04540634, "balance_loss_mlp": 1.02517748, "epoch": 0.416401130283172, "flos": 29168801377920.0, "grad_norm": 1.591792127577921, "language_loss": 0.79156715, "learning_rate": 2.6279528635724747e-06, "loss": 0.81285328, "num_input_tokens_seen": 74712205, "step": 3463, "time_per_iteration": 2.7355947494506836 }, { "auxiliary_loss_clip": 0.01169871, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.05279434, "balance_loss_mlp": 1.02390468, "epoch": 0.4165213731738111, "flos": 16246813478400.0, "grad_norm": 2.656959849512158, "language_loss": 0.78371286, "learning_rate": 2.627213236304848e-06, "loss": 0.80573821, "num_input_tokens_seen": 74729005, "step": 3464, "time_per_iteration": 2.51753568649292 }, { "auxiliary_loss_clip": 0.01175876, "auxiliary_loss_mlp": 0.01027782, "balance_loss_clip": 1.0562427, "balance_loss_mlp": 1.02003884, "epoch": 0.4166416160644502, "flos": 33765438787200.0, "grad_norm": 1.873316136810797, "language_loss": 0.70301902, "learning_rate": 2.626473513889472e-06, "loss": 0.72505558, "num_input_tokens_seen": 74751385, "step": 3465, "time_per_iteration": 2.6766517162323 }, { "auxiliary_loss_clip": 0.01162778, "auxiliary_loss_mlp": 0.01031229, "balance_loss_clip": 1.05353117, "balance_loss_mlp": 1.02315903, "epoch": 0.41676185895508927, "flos": 20917498775040.0, "grad_norm": 1.7865354090889305, "language_loss": 0.83087444, "learning_rate": 2.625733696438562e-06, "loss": 0.85281456, "num_input_tokens_seen": 74768890, "step": 3466, "time_per_iteration": 2.5560686588287354 }, { "auxiliary_loss_clip": 0.01155361, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.05329633, "balance_loss_mlp": 1.02683794, "epoch": 0.4168821018457284, "flos": 18406122549120.0, "grad_norm": 1.7444870414047702, "language_loss": 0.75122786, "learning_rate": 2.6249937840643476e-06, "loss": 0.77313226, "num_input_tokens_seen": 74787195, "step": 3467, "time_per_iteration": 3.3607144355773926 }, { "auxiliary_loss_clip": 0.01186527, "auxiliary_loss_mlp": 0.00761736, "balance_loss_clip": 1.05982018, "balance_loss_mlp": 1.00022542, "epoch": 0.41700234473636744, "flos": 18698399516160.0, "grad_norm": 1.6130800200153865, "language_loss": 0.66452098, "learning_rate": 2.6242537768790733e-06, "loss": 0.68400359, "num_input_tokens_seen": 74806350, "step": 3468, "time_per_iteration": 2.513181447982788 }, { "auxiliary_loss_clip": 0.01172843, "auxiliary_loss_mlp": 0.01026193, "balance_loss_clip": 1.05788684, "balance_loss_mlp": 1.0179143, "epoch": 0.41712258762700655, "flos": 31033283616000.0, "grad_norm": 4.733326937474909, "language_loss": 0.68618578, "learning_rate": 2.6235136749949975e-06, "loss": 0.70817614, "num_input_tokens_seen": 74829800, "step": 3469, "time_per_iteration": 2.650731325149536 }, { "auxiliary_loss_clip": 0.01183728, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.05621696, "balance_loss_mlp": 1.02180791, "epoch": 0.41724283051764566, "flos": 35914763877120.0, "grad_norm": 2.0540906831097465, "language_loss": 0.61274153, "learning_rate": 2.6227734785243924e-06, "loss": 0.63488036, "num_input_tokens_seen": 74849760, "step": 3470, "time_per_iteration": 2.624473810195923 }, { "auxiliary_loss_clip": 0.01105833, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.04472494, "balance_loss_mlp": 1.02329087, "epoch": 0.4173630734082847, "flos": 25333649320320.0, "grad_norm": 1.9665957482200438, "language_loss": 0.78995383, "learning_rate": 2.6220331875795466e-06, "loss": 0.81132805, "num_input_tokens_seen": 74869110, "step": 3471, "time_per_iteration": 2.701052188873291 }, { "auxiliary_loss_clip": 0.01167029, "auxiliary_loss_mlp": 0.01032461, "balance_loss_clip": 1.05584526, "balance_loss_mlp": 1.02390742, "epoch": 0.4174833162989238, "flos": 26685398868480.0, "grad_norm": 1.594105456538369, "language_loss": 0.75399929, "learning_rate": 2.62129280227276e-06, "loss": 0.77599418, "num_input_tokens_seen": 74889110, "step": 3472, "time_per_iteration": 4.912471771240234 }, { "auxiliary_loss_clip": 0.01178099, "auxiliary_loss_mlp": 0.01035498, "balance_loss_clip": 1.05865717, "balance_loss_mlp": 1.02732658, "epoch": 0.41760355918956293, "flos": 74739584010240.0, "grad_norm": 2.2130894016469345, "language_loss": 0.68342423, "learning_rate": 2.62055232271635e-06, "loss": 0.70556021, "num_input_tokens_seen": 74916260, "step": 3473, "time_per_iteration": 2.9343502521514893 }, { "auxiliary_loss_clip": 0.01132628, "auxiliary_loss_mlp": 0.01031925, "balance_loss_clip": 1.04887128, "balance_loss_mlp": 1.02358627, "epoch": 0.417723802080202, "flos": 14317513148160.0, "grad_norm": 2.182897822566128, "language_loss": 0.88092399, "learning_rate": 2.619811749022646e-06, "loss": 0.90256941, "num_input_tokens_seen": 74931570, "step": 3474, "time_per_iteration": 2.5923590660095215 }, { "auxiliary_loss_clip": 0.01175466, "auxiliary_loss_mlp": 0.01033905, "balance_loss_clip": 1.058864, "balance_loss_mlp": 1.02514935, "epoch": 0.4178440449708411, "flos": 14643797316480.0, "grad_norm": 2.2572486798842406, "language_loss": 0.71336657, "learning_rate": 2.6190710813039917e-06, "loss": 0.73546028, "num_input_tokens_seen": 74944695, "step": 3475, "time_per_iteration": 2.491145133972168 }, { "auxiliary_loss_clip": 0.01119264, "auxiliary_loss_mlp": 0.00762643, "balance_loss_clip": 1.04452884, "balance_loss_mlp": 1.00023532, "epoch": 0.4179642878614802, "flos": 21507296094720.0, "grad_norm": 2.988858590721069, "language_loss": 0.83980376, "learning_rate": 2.618330319672747e-06, "loss": 0.85862279, "num_input_tokens_seen": 74964115, "step": 3476, "time_per_iteration": 2.6449849605560303 }, { "auxiliary_loss_clip": 0.01188608, "auxiliary_loss_mlp": 0.01031737, "balance_loss_clip": 1.05953002, "balance_loss_mlp": 1.02382731, "epoch": 0.41808453075211927, "flos": 18441997257600.0, "grad_norm": 2.1851191262485927, "language_loss": 0.92041111, "learning_rate": 2.617589464241284e-06, "loss": 0.94261456, "num_input_tokens_seen": 74978515, "step": 3477, "time_per_iteration": 2.487933397293091 }, { "auxiliary_loss_clip": 0.01149139, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.05523038, "balance_loss_mlp": 1.02556157, "epoch": 0.4182047736427584, "flos": 20301020628480.0, "grad_norm": 1.9018786591088341, "language_loss": 0.74397212, "learning_rate": 2.6168485151219914e-06, "loss": 0.76579201, "num_input_tokens_seen": 74998135, "step": 3478, "time_per_iteration": 2.643768072128296 }, { "auxiliary_loss_clip": 0.01173965, "auxiliary_loss_mlp": 0.01031619, "balance_loss_clip": 1.05908835, "balance_loss_mlp": 1.02348852, "epoch": 0.4183250165333975, "flos": 18876623823360.0, "grad_norm": 2.119472793802139, "language_loss": 0.71363485, "learning_rate": 2.616107472427269e-06, "loss": 0.73569071, "num_input_tokens_seen": 75012830, "step": 3479, "time_per_iteration": 2.514937400817871 }, { "auxiliary_loss_clip": 0.01174268, "auxiliary_loss_mlp": 0.01029563, "balance_loss_clip": 1.05443716, "balance_loss_mlp": 1.02110553, "epoch": 0.41844525942403654, "flos": 17740050698880.0, "grad_norm": 2.541750093137917, "language_loss": 0.76056945, "learning_rate": 2.615366336269533e-06, "loss": 0.78260779, "num_input_tokens_seen": 75026495, "step": 3480, "time_per_iteration": 2.5297300815582275 }, { "auxiliary_loss_clip": 0.01189484, "auxiliary_loss_mlp": 0.01034398, "balance_loss_clip": 1.05830002, "balance_loss_mlp": 1.02562976, "epoch": 0.41856550231467565, "flos": 18361377181440.0, "grad_norm": 2.4254761477534448, "language_loss": 0.80722034, "learning_rate": 2.6146251067612126e-06, "loss": 0.82945919, "num_input_tokens_seen": 75041970, "step": 3481, "time_per_iteration": 2.480426788330078 }, { "auxiliary_loss_clip": 0.01170415, "auxiliary_loss_mlp": 0.01032132, "balance_loss_clip": 1.05801404, "balance_loss_mlp": 1.02410364, "epoch": 0.41868574520531476, "flos": 22781801445120.0, "grad_norm": 1.8846427411514612, "language_loss": 0.82556105, "learning_rate": 2.6138837840147525e-06, "loss": 0.84758651, "num_input_tokens_seen": 75061005, "step": 3482, "time_per_iteration": 2.537447690963745 }, { "auxiliary_loss_clip": 0.01140685, "auxiliary_loss_mlp": 0.01032867, "balance_loss_clip": 1.05119205, "balance_loss_mlp": 1.02380729, "epoch": 0.4188059880959538, "flos": 13699167494400.0, "grad_norm": 2.174100577432015, "language_loss": 0.76577389, "learning_rate": 2.6131423681426103e-06, "loss": 0.78750932, "num_input_tokens_seen": 75076920, "step": 3483, "time_per_iteration": 2.5713999271392822 }, { "auxiliary_loss_clip": 0.01184262, "auxiliary_loss_mlp": 0.01034233, "balance_loss_clip": 1.05742574, "balance_loss_mlp": 1.0263114, "epoch": 0.41892623098659293, "flos": 37818281220480.0, "grad_norm": 1.7667775902007472, "language_loss": 0.72807598, "learning_rate": 2.6124008592572587e-06, "loss": 0.75026095, "num_input_tokens_seen": 75100905, "step": 3484, "time_per_iteration": 2.682039499282837 }, { "auxiliary_loss_clip": 0.0118785, "auxiliary_loss_mlp": 0.01030989, "balance_loss_clip": 1.05694234, "balance_loss_mlp": 1.02223945, "epoch": 0.419046473877232, "flos": 23258874908160.0, "grad_norm": 2.4692633686802505, "language_loss": 0.81359011, "learning_rate": 2.6116592574711835e-06, "loss": 0.83577847, "num_input_tokens_seen": 75119205, "step": 3485, "time_per_iteration": 2.5075910091400146 }, { "auxiliary_loss_clip": 0.01187857, "auxiliary_loss_mlp": 0.01034667, "balance_loss_clip": 1.0577904, "balance_loss_mlp": 1.0260365, "epoch": 0.4191667167678711, "flos": 20741034234240.0, "grad_norm": 1.8064727324105714, "language_loss": 0.83826619, "learning_rate": 2.6109175628968853e-06, "loss": 0.86049151, "num_input_tokens_seen": 75138970, "step": 3486, "time_per_iteration": 2.5484867095947266 }, { "auxiliary_loss_clip": 0.01159941, "auxiliary_loss_mlp": 0.0102829, "balance_loss_clip": 1.05244231, "balance_loss_mlp": 1.02010012, "epoch": 0.4192869596585102, "flos": 23586416052480.0, "grad_norm": 1.90220184249877, "language_loss": 0.82511795, "learning_rate": 2.610175775646878e-06, "loss": 0.84700024, "num_input_tokens_seen": 75157550, "step": 3487, "time_per_iteration": 2.5615739822387695 }, { "auxiliary_loss_clip": 0.01155452, "auxiliary_loss_mlp": 0.01035546, "balance_loss_clip": 1.05297065, "balance_loss_mlp": 1.02715969, "epoch": 0.41940720254914926, "flos": 25081269384960.0, "grad_norm": 1.885030405023015, "language_loss": 0.73065513, "learning_rate": 2.6094338958336907e-06, "loss": 0.75256515, "num_input_tokens_seen": 75176220, "step": 3488, "time_per_iteration": 2.6210906505584717 }, { "auxiliary_loss_clip": 0.01158059, "auxiliary_loss_mlp": 0.01030089, "balance_loss_clip": 1.05613291, "balance_loss_mlp": 1.02273703, "epoch": 0.41952744543978837, "flos": 15554132628480.0, "grad_norm": 1.9715129278974122, "language_loss": 0.81925255, "learning_rate": 2.608691923569867e-06, "loss": 0.84113407, "num_input_tokens_seen": 75193095, "step": 3489, "time_per_iteration": 2.5470285415649414 }, { "auxiliary_loss_clip": 0.01176651, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.0582782, "balance_loss_mlp": 1.02935886, "epoch": 0.4196476883304275, "flos": 24644775312000.0, "grad_norm": 1.664511613313216, "language_loss": 0.7576685, "learning_rate": 2.6079498589679616e-06, "loss": 0.77981317, "num_input_tokens_seen": 75214185, "step": 3490, "time_per_iteration": 2.5672545433044434 }, { "auxiliary_loss_clip": 0.01111322, "auxiliary_loss_mlp": 0.0103357, "balance_loss_clip": 1.04607856, "balance_loss_mlp": 1.02499902, "epoch": 0.41976793122106654, "flos": 24531333183360.0, "grad_norm": 1.953089129942015, "language_loss": 0.75910807, "learning_rate": 2.6072077021405465e-06, "loss": 0.78055704, "num_input_tokens_seen": 75233020, "step": 3491, "time_per_iteration": 2.7050507068634033 }, { "auxiliary_loss_clip": 0.0114834, "auxiliary_loss_mlp": 0.01030982, "balance_loss_clip": 1.05014682, "balance_loss_mlp": 1.02346635, "epoch": 0.41988817411170565, "flos": 21175301664000.0, "grad_norm": 1.8424427335420928, "language_loss": 0.69392455, "learning_rate": 2.6064654532002054e-06, "loss": 0.71571773, "num_input_tokens_seen": 75252030, "step": 3492, "time_per_iteration": 2.614002227783203 }, { "auxiliary_loss_clip": 0.01187712, "auxiliary_loss_mlp": 0.01033794, "balance_loss_clip": 1.0589813, "balance_loss_mlp": 1.02616167, "epoch": 0.42000841700234476, "flos": 31649402626560.0, "grad_norm": 3.262968948774268, "language_loss": 0.75843024, "learning_rate": 2.6057231122595375e-06, "loss": 0.78064537, "num_input_tokens_seen": 75273340, "step": 3493, "time_per_iteration": 3.445108413696289 }, { "auxiliary_loss_clip": 0.01158407, "auxiliary_loss_mlp": 0.01037653, "balance_loss_clip": 1.05245996, "balance_loss_mlp": 1.0295769, "epoch": 0.4201286598929838, "flos": 21281525159040.0, "grad_norm": 1.6243008911880297, "language_loss": 0.72834587, "learning_rate": 2.604980679431154e-06, "loss": 0.75030649, "num_input_tokens_seen": 75291580, "step": 3494, "time_per_iteration": 2.5698959827423096 }, { "auxiliary_loss_clip": 0.01172373, "auxiliary_loss_mlp": 0.01027361, "balance_loss_clip": 1.05228698, "balance_loss_mlp": 1.01926684, "epoch": 0.4202489027836229, "flos": 18546532813440.0, "grad_norm": 2.440048805860495, "language_loss": 0.74466449, "learning_rate": 2.604238154827684e-06, "loss": 0.76666188, "num_input_tokens_seen": 75308205, "step": 3495, "time_per_iteration": 2.5141279697418213 }, { "auxiliary_loss_clip": 0.01174765, "auxiliary_loss_mlp": 0.01026363, "balance_loss_clip": 1.05624902, "balance_loss_mlp": 1.0192225, "epoch": 0.42036914567426203, "flos": 19317643009920.0, "grad_norm": 1.8336332340323391, "language_loss": 0.72966468, "learning_rate": 2.6034955385617656e-06, "loss": 0.75167596, "num_input_tokens_seen": 75326535, "step": 3496, "time_per_iteration": 2.5247223377227783 }, { "auxiliary_loss_clip": 0.01047883, "auxiliary_loss_mlp": 0.01005175, "balance_loss_clip": 1.02145767, "balance_loss_mlp": 1.00387537, "epoch": 0.4204893885649011, "flos": 67842942935040.0, "grad_norm": 0.7282328108378927, "language_loss": 0.61719263, "learning_rate": 2.6027528307460544e-06, "loss": 0.63772321, "num_input_tokens_seen": 75390540, "step": 3497, "time_per_iteration": 3.25300669670105 }, { "auxiliary_loss_clip": 0.01187341, "auxiliary_loss_mlp": 0.01025224, "balance_loss_clip": 1.057621, "balance_loss_mlp": 1.01717734, "epoch": 0.4206096314555402, "flos": 21908777385600.0, "grad_norm": 1.943516556399264, "language_loss": 0.86929882, "learning_rate": 2.602010031493217e-06, "loss": 0.89142448, "num_input_tokens_seen": 75408770, "step": 3498, "time_per_iteration": 4.772298336029053 }, { "auxiliary_loss_clip": 0.0113765, "auxiliary_loss_mlp": 0.01028492, "balance_loss_clip": 1.04993105, "balance_loss_mlp": 1.0204339, "epoch": 0.42072987434617926, "flos": 29278185269760.0, "grad_norm": 1.8907431993242858, "language_loss": 0.87278438, "learning_rate": 2.6012671409159367e-06, "loss": 0.89444578, "num_input_tokens_seen": 75430105, "step": 3499, "time_per_iteration": 2.670034646987915 }, { "auxiliary_loss_clip": 0.01152741, "auxiliary_loss_mlp": 0.01024682, "balance_loss_clip": 1.05315411, "balance_loss_mlp": 1.01617694, "epoch": 0.42085011723681837, "flos": 27600726170880.0, "grad_norm": 1.5983131032612379, "language_loss": 0.81877744, "learning_rate": 2.6005241591269097e-06, "loss": 0.84055167, "num_input_tokens_seen": 75449475, "step": 3500, "time_per_iteration": 2.626570463180542 }, { "auxiliary_loss_clip": 0.01137967, "auxiliary_loss_mlp": 0.01032174, "balance_loss_clip": 1.05382907, "balance_loss_mlp": 1.02468491, "epoch": 0.4209703601274575, "flos": 27818632028160.0, "grad_norm": 1.8057243406085994, "language_loss": 0.79689336, "learning_rate": 2.5997810862388454e-06, "loss": 0.81859475, "num_input_tokens_seen": 75469315, "step": 3501, "time_per_iteration": 2.6550354957580566 }, { "auxiliary_loss_clip": 0.01154609, "auxiliary_loss_mlp": 0.01031868, "balance_loss_clip": 1.04993463, "balance_loss_mlp": 1.02337408, "epoch": 0.42109060301809653, "flos": 27525529048320.0, "grad_norm": 2.559728587930794, "language_loss": 0.75832236, "learning_rate": 2.599037922364467e-06, "loss": 0.78018713, "num_input_tokens_seen": 75488215, "step": 3502, "time_per_iteration": 2.62187123298645 }, { "auxiliary_loss_clip": 0.01135832, "auxiliary_loss_mlp": 0.01028823, "balance_loss_clip": 1.05138969, "balance_loss_mlp": 1.02121198, "epoch": 0.42121084590873564, "flos": 29314275459840.0, "grad_norm": 2.506327299717238, "language_loss": 0.75491107, "learning_rate": 2.5982946676165112e-06, "loss": 0.77655768, "num_input_tokens_seen": 75507985, "step": 3503, "time_per_iteration": 2.662526845932007 }, { "auxiliary_loss_clip": 0.01041473, "auxiliary_loss_mlp": 0.01005379, "balance_loss_clip": 1.02087712, "balance_loss_mlp": 1.00402021, "epoch": 0.42133108879937475, "flos": 67398835178880.0, "grad_norm": 0.7275168824376893, "language_loss": 0.57686812, "learning_rate": 2.5975513221077313e-06, "loss": 0.59733665, "num_input_tokens_seen": 75571955, "step": 3504, "time_per_iteration": 3.246126174926758 }, { "auxiliary_loss_clip": 0.01148034, "auxiliary_loss_mlp": 0.01032902, "balance_loss_clip": 1.05052674, "balance_loss_mlp": 1.02480757, "epoch": 0.4214513316900138, "flos": 23106038538240.0, "grad_norm": 2.230404719719764, "language_loss": 0.88608241, "learning_rate": 2.5968078859508897e-06, "loss": 0.90789175, "num_input_tokens_seen": 75589155, "step": 3505, "time_per_iteration": 2.576901435852051 }, { "auxiliary_loss_clip": 0.01171157, "auxiliary_loss_mlp": 0.01027064, "balance_loss_clip": 1.05472362, "balance_loss_mlp": 1.01921988, "epoch": 0.4215715745806529, "flos": 15336190857600.0, "grad_norm": 1.8264308526132738, "language_loss": 0.79725635, "learning_rate": 2.5960643592587673e-06, "loss": 0.8192386, "num_input_tokens_seen": 75606565, "step": 3506, "time_per_iteration": 2.5058271884918213 }, { "auxiliary_loss_clip": 0.01142186, "auxiliary_loss_mlp": 0.010333, "balance_loss_clip": 1.04925752, "balance_loss_mlp": 1.02517009, "epoch": 0.42169181747129203, "flos": 22127257860480.0, "grad_norm": 1.8339397834681952, "language_loss": 0.81494701, "learning_rate": 2.5953207421441553e-06, "loss": 0.83670187, "num_input_tokens_seen": 75625165, "step": 3507, "time_per_iteration": 2.602940559387207 }, { "auxiliary_loss_clip": 0.0114339, "auxiliary_loss_mlp": 0.01028844, "balance_loss_clip": 1.0502212, "balance_loss_mlp": 1.02098179, "epoch": 0.4218120603619311, "flos": 22630724841600.0, "grad_norm": 2.241868976252056, "language_loss": 0.75179935, "learning_rate": 2.5945770347198603e-06, "loss": 0.77352172, "num_input_tokens_seen": 75643320, "step": 3508, "time_per_iteration": 2.594249725341797 }, { "auxiliary_loss_clip": 0.01147995, "auxiliary_loss_mlp": 0.01029242, "balance_loss_clip": 1.04844332, "balance_loss_mlp": 1.02160621, "epoch": 0.4219323032525702, "flos": 19682818629120.0, "grad_norm": 1.7627605661178114, "language_loss": 0.81732285, "learning_rate": 2.593833237098701e-06, "loss": 0.83909523, "num_input_tokens_seen": 75660920, "step": 3509, "time_per_iteration": 2.566551446914673 }, { "auxiliary_loss_clip": 0.01167089, "auxiliary_loss_mlp": 0.0103271, "balance_loss_clip": 1.0495038, "balance_loss_mlp": 1.02370369, "epoch": 0.4220525461432093, "flos": 30190747224960.0, "grad_norm": 2.451371246360345, "language_loss": 0.62634528, "learning_rate": 2.593089349393512e-06, "loss": 0.64834332, "num_input_tokens_seen": 75681410, "step": 3510, "time_per_iteration": 2.5852394104003906 }, { "auxiliary_loss_clip": 0.01167049, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.05325079, "balance_loss_mlp": 1.02946639, "epoch": 0.42217278903384836, "flos": 24315941278080.0, "grad_norm": 2.4401091612846306, "language_loss": 0.83643866, "learning_rate": 2.592345371717141e-06, "loss": 0.85848272, "num_input_tokens_seen": 75700940, "step": 3511, "time_per_iteration": 2.542020559310913 }, { "auxiliary_loss_clip": 0.01172118, "auxiliary_loss_mlp": 0.01033573, "balance_loss_clip": 1.05724168, "balance_loss_mlp": 1.02489424, "epoch": 0.42229303192448747, "flos": 17092474352640.0, "grad_norm": 2.021320555818237, "language_loss": 0.72221094, "learning_rate": 2.591601304182448e-06, "loss": 0.74426794, "num_input_tokens_seen": 75718910, "step": 3512, "time_per_iteration": 2.49338960647583 }, { "auxiliary_loss_clip": 0.01157131, "auxiliary_loss_mlp": 0.01028901, "balance_loss_clip": 1.05381536, "balance_loss_mlp": 1.0211463, "epoch": 0.4224132748151266, "flos": 22784530878720.0, "grad_norm": 1.9811853750962924, "language_loss": 0.79165322, "learning_rate": 2.5908571469023067e-06, "loss": 0.81351352, "num_input_tokens_seen": 75738395, "step": 3513, "time_per_iteration": 2.5865509510040283 }, { "auxiliary_loss_clip": 0.01183031, "auxiliary_loss_mlp": 0.01032491, "balance_loss_clip": 1.0547049, "balance_loss_mlp": 1.02394414, "epoch": 0.42253351770576564, "flos": 17819090576640.0, "grad_norm": 2.454645099029267, "language_loss": 0.75859642, "learning_rate": 2.5901128999896067e-06, "loss": 0.78075165, "num_input_tokens_seen": 75753825, "step": 3514, "time_per_iteration": 2.444157838821411 }, { "auxiliary_loss_clip": 0.01168148, "auxiliary_loss_mlp": 0.01029988, "balance_loss_clip": 1.05263448, "balance_loss_mlp": 1.02181602, "epoch": 0.42265376059640475, "flos": 28512390286080.0, "grad_norm": 1.5885827255514073, "language_loss": 0.68141115, "learning_rate": 2.5893685635572487e-06, "loss": 0.70339251, "num_input_tokens_seen": 75774675, "step": 3515, "time_per_iteration": 2.5777931213378906 }, { "auxiliary_loss_clip": 0.01154641, "auxiliary_loss_mlp": 0.01034004, "balance_loss_clip": 1.05227923, "balance_loss_mlp": 1.0253433, "epoch": 0.4227740034870438, "flos": 16253349753600.0, "grad_norm": 2.421810760210724, "language_loss": 0.69649732, "learning_rate": 2.5886241377181483e-06, "loss": 0.71838379, "num_input_tokens_seen": 75793545, "step": 3516, "time_per_iteration": 2.544175863265991 }, { "auxiliary_loss_clip": 0.01171595, "auxiliary_loss_mlp": 0.01035153, "balance_loss_clip": 1.0533402, "balance_loss_mlp": 1.02606916, "epoch": 0.4228942463776829, "flos": 25295691623040.0, "grad_norm": 2.072457822094871, "language_loss": 0.81299049, "learning_rate": 2.587879622585234e-06, "loss": 0.83505791, "num_input_tokens_seen": 75812145, "step": 3517, "time_per_iteration": 2.563154935836792 }, { "auxiliary_loss_clip": 0.01170818, "auxiliary_loss_mlp": 0.01030964, "balance_loss_clip": 1.05642498, "balance_loss_mlp": 1.02346539, "epoch": 0.423014489268322, "flos": 26395779507840.0, "grad_norm": 1.9548951563634942, "language_loss": 0.76455414, "learning_rate": 2.5871350182714486e-06, "loss": 0.78657192, "num_input_tokens_seen": 75833025, "step": 3518, "time_per_iteration": 2.5727250576019287 }, { "auxiliary_loss_clip": 0.01181328, "auxiliary_loss_mlp": 0.01036689, "balance_loss_clip": 1.05291891, "balance_loss_mlp": 1.02860689, "epoch": 0.4231347321589611, "flos": 17274002711040.0, "grad_norm": 1.9751636009400335, "language_loss": 0.80377674, "learning_rate": 2.586390324889748e-06, "loss": 0.82595694, "num_input_tokens_seen": 75848925, "step": 3519, "time_per_iteration": 3.279618263244629 }, { "auxiliary_loss_clip": 0.01162624, "auxiliary_loss_mlp": 0.01032535, "balance_loss_clip": 1.0500623, "balance_loss_mlp": 1.02451229, "epoch": 0.4232549750496002, "flos": 22999635475200.0, "grad_norm": 1.7920387712202162, "language_loss": 0.67368472, "learning_rate": 2.5856455425531003e-06, "loss": 0.69563627, "num_input_tokens_seen": 75870400, "step": 3520, "time_per_iteration": 2.535332679748535 }, { "auxiliary_loss_clip": 0.01168216, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 1.05334187, "balance_loss_mlp": 1.02470422, "epoch": 0.4233752179402393, "flos": 21248343970560.0, "grad_norm": 2.4736974637994256, "language_loss": 0.80687439, "learning_rate": 2.5849006713744902e-06, "loss": 0.82888186, "num_input_tokens_seen": 75889195, "step": 3521, "time_per_iteration": 2.5192248821258545 }, { "auxiliary_loss_clip": 0.01149078, "auxiliary_loss_mlp": 0.01027338, "balance_loss_clip": 1.05004549, "balance_loss_mlp": 1.01880825, "epoch": 0.42349546083087836, "flos": 20704297599360.0, "grad_norm": 3.0480304614226235, "language_loss": 0.72776365, "learning_rate": 2.5841557114669135e-06, "loss": 0.74952781, "num_input_tokens_seen": 75906055, "step": 3522, "time_per_iteration": 2.551253318786621 }, { "auxiliary_loss_clip": 0.0118554, "auxiliary_loss_mlp": 0.01032693, "balance_loss_clip": 1.05399561, "balance_loss_mlp": 1.02409792, "epoch": 0.42361570372151747, "flos": 18585065128320.0, "grad_norm": 3.084171921132483, "language_loss": 0.67372882, "learning_rate": 2.58341066294338e-06, "loss": 0.69591117, "num_input_tokens_seen": 75922720, "step": 3523, "time_per_iteration": 2.446063995361328 }, { "auxiliary_loss_clip": 0.01129357, "auxiliary_loss_mlp": 0.00762126, "balance_loss_clip": 1.04828715, "balance_loss_mlp": 1.00026202, "epoch": 0.4237359466121566, "flos": 20959478795520.0, "grad_norm": 3.675846409416609, "language_loss": 0.85560393, "learning_rate": 2.5826655259169124e-06, "loss": 0.87451881, "num_input_tokens_seen": 75941375, "step": 3524, "time_per_iteration": 4.19012975692749 }, { "auxiliary_loss_clip": 0.01186861, "auxiliary_loss_mlp": 0.01029791, "balance_loss_clip": 1.05769765, "balance_loss_mlp": 1.02187538, "epoch": 0.42385618950279563, "flos": 18038181582720.0, "grad_norm": 1.8096748581740825, "language_loss": 0.90427226, "learning_rate": 2.5819203005005475e-06, "loss": 0.92643869, "num_input_tokens_seen": 75958710, "step": 3525, "time_per_iteration": 3.3526864051818848 }, { "auxiliary_loss_clip": 0.0114837, "auxiliary_loss_mlp": 0.01027161, "balance_loss_clip": 1.05152822, "balance_loss_mlp": 1.01975775, "epoch": 0.42397643239343474, "flos": 23769129559680.0, "grad_norm": 1.6790216191579026, "language_loss": 0.78910536, "learning_rate": 2.581174986807336e-06, "loss": 0.81086063, "num_input_tokens_seen": 75978945, "step": 3526, "time_per_iteration": 2.556105852127075 }, { "auxiliary_loss_clip": 0.01160695, "auxiliary_loss_mlp": 0.00762434, "balance_loss_clip": 1.05174851, "balance_loss_mlp": 1.0002265, "epoch": 0.42409667528407385, "flos": 16545088016640.0, "grad_norm": 2.080561334251366, "language_loss": 0.91387498, "learning_rate": 2.580429584950341e-06, "loss": 0.9331063, "num_input_tokens_seen": 75994695, "step": 3527, "time_per_iteration": 2.520254611968994 }, { "auxiliary_loss_clip": 0.01144012, "auxiliary_loss_mlp": 0.01031, "balance_loss_clip": 1.05041981, "balance_loss_mlp": 1.02239311, "epoch": 0.4242169181747129, "flos": 16034186920320.0, "grad_norm": 2.117491170732389, "language_loss": 0.66232002, "learning_rate": 2.5796840950426397e-06, "loss": 0.68407017, "num_input_tokens_seen": 76011780, "step": 3528, "time_per_iteration": 2.5469422340393066 }, { "auxiliary_loss_clip": 0.01159186, "auxiliary_loss_mlp": 0.01025044, "balance_loss_clip": 1.05034685, "balance_loss_mlp": 1.01699185, "epoch": 0.424337161065352, "flos": 20084012611200.0, "grad_norm": 1.6774667491020243, "language_loss": 0.65460372, "learning_rate": 2.578938517197322e-06, "loss": 0.67644608, "num_input_tokens_seen": 76029875, "step": 3529, "time_per_iteration": 2.5285439491271973 }, { "auxiliary_loss_clip": 0.0114601, "auxiliary_loss_mlp": 0.0103564, "balance_loss_clip": 1.05062509, "balance_loss_mlp": 1.0276649, "epoch": 0.4244574039559911, "flos": 23878369797120.0, "grad_norm": 2.312528248818574, "language_loss": 0.6215831, "learning_rate": 2.5781928515274916e-06, "loss": 0.6433996, "num_input_tokens_seen": 76048595, "step": 3530, "time_per_iteration": 2.5553436279296875 }, { "auxiliary_loss_clip": 0.01173968, "auxiliary_loss_mlp": 0.01030239, "balance_loss_clip": 1.05647421, "balance_loss_mlp": 1.02170324, "epoch": 0.4245776468466302, "flos": 17565920542080.0, "grad_norm": 2.066287835927777, "language_loss": 0.67784119, "learning_rate": 2.577447098146265e-06, "loss": 0.69988328, "num_input_tokens_seen": 76065770, "step": 3531, "time_per_iteration": 2.5095884799957275 }, { "auxiliary_loss_clip": 0.01144525, "auxiliary_loss_mlp": 0.01033597, "balance_loss_clip": 1.05100989, "balance_loss_mlp": 1.0257293, "epoch": 0.4246978897372693, "flos": 27776256958080.0, "grad_norm": 1.6135778046324214, "language_loss": 0.79102302, "learning_rate": 2.5767012571667724e-06, "loss": 0.81280422, "num_input_tokens_seen": 76085250, "step": 3532, "time_per_iteration": 2.6287782192230225 }, { "auxiliary_loss_clip": 0.01168454, "auxiliary_loss_mlp": 0.01028374, "balance_loss_clip": 1.04976439, "balance_loss_mlp": 1.01955259, "epoch": 0.42481813262790835, "flos": 15596615439360.0, "grad_norm": 1.8427547775192907, "language_loss": 0.68474454, "learning_rate": 2.5759553287021587e-06, "loss": 0.70671278, "num_input_tokens_seen": 76103580, "step": 3533, "time_per_iteration": 2.512155294418335 }, { "auxiliary_loss_clip": 0.01152838, "auxiliary_loss_mlp": 0.01033142, "balance_loss_clip": 1.05119658, "balance_loss_mlp": 1.02472031, "epoch": 0.42493837551854746, "flos": 23951088881280.0, "grad_norm": 2.042815085442589, "language_loss": 0.77572459, "learning_rate": 2.5752093128655786e-06, "loss": 0.79758435, "num_input_tokens_seen": 76121825, "step": 3534, "time_per_iteration": 2.5657310485839844 }, { "auxiliary_loss_clip": 0.01146404, "auxiliary_loss_mlp": 0.01035371, "balance_loss_clip": 1.04886365, "balance_loss_mlp": 1.02679992, "epoch": 0.4250586184091866, "flos": 20813466009600.0, "grad_norm": 1.6545393258571086, "language_loss": 0.7355082, "learning_rate": 2.574463209770204e-06, "loss": 0.75732595, "num_input_tokens_seen": 76141140, "step": 3535, "time_per_iteration": 2.563304901123047 }, { "auxiliary_loss_clip": 0.01134553, "auxiliary_loss_mlp": 0.01033312, "balance_loss_clip": 1.04462647, "balance_loss_mlp": 1.02459157, "epoch": 0.42517886129982563, "flos": 30371018607360.0, "grad_norm": 1.790897842677319, "language_loss": 0.79517722, "learning_rate": 2.5737170195292165e-06, "loss": 0.81685585, "num_input_tokens_seen": 76164475, "step": 3536, "time_per_iteration": 2.6618492603302 }, { "auxiliary_loss_clip": 0.01142198, "auxiliary_loss_mlp": 0.01030455, "balance_loss_clip": 1.04954755, "balance_loss_mlp": 1.02264118, "epoch": 0.42529910419046474, "flos": 20080636732800.0, "grad_norm": 1.7539232116611918, "language_loss": 0.77965492, "learning_rate": 2.572970742255814e-06, "loss": 0.80138147, "num_input_tokens_seen": 76182965, "step": 3537, "time_per_iteration": 2.6171765327453613 }, { "auxiliary_loss_clip": 0.01169664, "auxiliary_loss_mlp": 0.01032677, "balance_loss_clip": 1.05544984, "balance_loss_mlp": 1.02507186, "epoch": 0.42541934708110385, "flos": 22632448694400.0, "grad_norm": 1.6098970344324968, "language_loss": 0.81619358, "learning_rate": 2.5722243780632046e-06, "loss": 0.83821702, "num_input_tokens_seen": 76201230, "step": 3538, "time_per_iteration": 2.5432956218719482 }, { "auxiliary_loss_clip": 0.01036457, "auxiliary_loss_mlp": 0.01001983, "balance_loss_clip": 1.0186044, "balance_loss_mlp": 1.00064743, "epoch": 0.4255395899717429, "flos": 66200676186240.0, "grad_norm": 0.7460297209605776, "language_loss": 0.60482728, "learning_rate": 2.5714779270646125e-06, "loss": 0.62521172, "num_input_tokens_seen": 76262000, "step": 3539, "time_per_iteration": 3.1595652103424072 }, { "auxiliary_loss_clip": 0.01159473, "auxiliary_loss_mlp": 0.00761874, "balance_loss_clip": 1.05507588, "balance_loss_mlp": 1.00023866, "epoch": 0.425659832862382, "flos": 17931814433280.0, "grad_norm": 2.3529548464870906, "language_loss": 0.77055436, "learning_rate": 2.5707313893732735e-06, "loss": 0.78976786, "num_input_tokens_seen": 76280540, "step": 3540, "time_per_iteration": 2.5637471675872803 }, { "auxiliary_loss_clip": 0.0109095, "auxiliary_loss_mlp": 0.010262, "balance_loss_clip": 1.04064798, "balance_loss_mlp": 1.01845145, "epoch": 0.4257800757530211, "flos": 24022550989440.0, "grad_norm": 1.715210662677564, "language_loss": 0.76539767, "learning_rate": 2.5699847651024364e-06, "loss": 0.78656912, "num_input_tokens_seen": 76301180, "step": 3541, "time_per_iteration": 2.7320802211761475 }, { "auxiliary_loss_clip": 0.01169087, "auxiliary_loss_mlp": 0.01028247, "balance_loss_clip": 1.05576897, "balance_loss_mlp": 1.02048612, "epoch": 0.4259003186436602, "flos": 23696015425920.0, "grad_norm": 2.0296911777047595, "language_loss": 0.77134466, "learning_rate": 2.5692380543653627e-06, "loss": 0.79331803, "num_input_tokens_seen": 76319335, "step": 3542, "time_per_iteration": 2.580747604370117 }, { "auxiliary_loss_clip": 0.01173677, "auxiliary_loss_mlp": 0.00761962, "balance_loss_clip": 1.05470514, "balance_loss_mlp": 1.00024724, "epoch": 0.4260205615342993, "flos": 15259772672640.0, "grad_norm": 1.9255014320585002, "language_loss": 0.69535279, "learning_rate": 2.5684912572753293e-06, "loss": 0.71470916, "num_input_tokens_seen": 76335010, "step": 3543, "time_per_iteration": 2.491575002670288 }, { "auxiliary_loss_clip": 0.01179138, "auxiliary_loss_mlp": 0.01030458, "balance_loss_clip": 1.05357289, "balance_loss_mlp": 1.02338314, "epoch": 0.4261408044249384, "flos": 30665306736000.0, "grad_norm": 1.8082419896929556, "language_loss": 0.83797491, "learning_rate": 2.5677443739456245e-06, "loss": 0.86007088, "num_input_tokens_seen": 76356670, "step": 3544, "time_per_iteration": 2.570080280303955 }, { "auxiliary_loss_clip": 0.01154523, "auxiliary_loss_mlp": 0.01026979, "balance_loss_clip": 1.05202174, "balance_loss_mlp": 1.01952505, "epoch": 0.42626104731557746, "flos": 23257905240960.0, "grad_norm": 2.7780525164864804, "language_loss": 0.79281634, "learning_rate": 2.5669974044895495e-06, "loss": 0.81463134, "num_input_tokens_seen": 76373065, "step": 3545, "time_per_iteration": 3.3444738388061523 }, { "auxiliary_loss_clip": 0.0114776, "auxiliary_loss_mlp": 0.01026418, "balance_loss_clip": 1.05002415, "balance_loss_mlp": 1.0195874, "epoch": 0.42638129020621657, "flos": 25884770670720.0, "grad_norm": 1.721572569615772, "language_loss": 0.79546058, "learning_rate": 2.5662503490204187e-06, "loss": 0.81720233, "num_input_tokens_seen": 76393230, "step": 3546, "time_per_iteration": 2.659017562866211 }, { "auxiliary_loss_clip": 0.01151841, "auxiliary_loss_mlp": 0.01025179, "balance_loss_clip": 1.04820275, "balance_loss_mlp": 1.01747239, "epoch": 0.4265015330968556, "flos": 26502362138880.0, "grad_norm": 1.6924505897823745, "language_loss": 0.76018107, "learning_rate": 2.5655032076515603e-06, "loss": 0.78195119, "num_input_tokens_seen": 76412555, "step": 3547, "time_per_iteration": 2.596353530883789 }, { "auxiliary_loss_clip": 0.01155073, "auxiliary_loss_mlp": 0.01030461, "balance_loss_clip": 1.05291629, "balance_loss_mlp": 1.02263474, "epoch": 0.42662177598749473, "flos": 24389522288640.0, "grad_norm": 1.9785522239464388, "language_loss": 0.82257265, "learning_rate": 2.5647559804963155e-06, "loss": 0.844428, "num_input_tokens_seen": 76432485, "step": 3548, "time_per_iteration": 2.5991241931915283 }, { "auxiliary_loss_clip": 0.01134055, "auxiliary_loss_mlp": 0.01028829, "balance_loss_clip": 1.05037582, "balance_loss_mlp": 1.02105045, "epoch": 0.42674201887813384, "flos": 23148629089920.0, "grad_norm": 2.0302918210737135, "language_loss": 0.79135823, "learning_rate": 2.5640086676680364e-06, "loss": 0.81298703, "num_input_tokens_seen": 76453980, "step": 3549, "time_per_iteration": 2.64599609375 }, { "auxiliary_loss_clip": 0.01167609, "auxiliary_loss_mlp": 0.01034126, "balance_loss_clip": 1.05155659, "balance_loss_mlp": 1.02545953, "epoch": 0.4268622617687729, "flos": 21689614552320.0, "grad_norm": 2.3509104120552795, "language_loss": 0.80175078, "learning_rate": 2.5632612692800923e-06, "loss": 0.82376808, "num_input_tokens_seen": 76473045, "step": 3550, "time_per_iteration": 4.062756299972534 }, { "auxiliary_loss_clip": 0.01144022, "auxiliary_loss_mlp": 0.01029329, "balance_loss_clip": 1.05178022, "balance_loss_mlp": 1.02091837, "epoch": 0.426982504659412, "flos": 23440151871360.0, "grad_norm": 2.055721493537037, "language_loss": 0.75207937, "learning_rate": 2.5625137854458603e-06, "loss": 0.77381283, "num_input_tokens_seen": 76492060, "step": 3551, "time_per_iteration": 2.63824200630188 }, { "auxiliary_loss_clip": 0.01158515, "auxiliary_loss_mlp": 0.010228, "balance_loss_clip": 1.05308771, "balance_loss_mlp": 1.01540899, "epoch": 0.4271027475500511, "flos": 18916556768640.0, "grad_norm": 1.9502123933766422, "language_loss": 0.802019, "learning_rate": 2.561766216278735e-06, "loss": 0.82383215, "num_input_tokens_seen": 76509655, "step": 3552, "time_per_iteration": 2.5512309074401855 }, { "auxiliary_loss_clip": 0.01123839, "auxiliary_loss_mlp": 0.0103535, "balance_loss_clip": 1.0474546, "balance_loss_mlp": 1.02701092, "epoch": 0.4272229904406902, "flos": 26870554500480.0, "grad_norm": 2.36861838428947, "language_loss": 0.81483746, "learning_rate": 2.561018561892121e-06, "loss": 0.83642936, "num_input_tokens_seen": 76528795, "step": 3553, "time_per_iteration": 2.710111379623413 }, { "auxiliary_loss_clip": 0.01150858, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.04971576, "balance_loss_mlp": 1.02256227, "epoch": 0.4273432333313293, "flos": 23951376190080.0, "grad_norm": 1.5264465562472036, "language_loss": 0.76468539, "learning_rate": 2.5602708223994363e-06, "loss": 0.78649735, "num_input_tokens_seen": 76550660, "step": 3554, "time_per_iteration": 2.5845320224761963 }, { "auxiliary_loss_clip": 0.01141492, "auxiliary_loss_mlp": 0.01032281, "balance_loss_clip": 1.04617262, "balance_loss_mlp": 1.02391791, "epoch": 0.4274634762219684, "flos": 29570354496000.0, "grad_norm": 4.246482863044678, "language_loss": 0.67731261, "learning_rate": 2.559522997914115e-06, "loss": 0.69905031, "num_input_tokens_seen": 76570240, "step": 3555, "time_per_iteration": 2.652238607406616 }, { "auxiliary_loss_clip": 0.01181905, "auxiliary_loss_mlp": 0.01029393, "balance_loss_clip": 1.05590987, "balance_loss_mlp": 1.02181101, "epoch": 0.42758371911260745, "flos": 21434146047360.0, "grad_norm": 2.5309860042308645, "language_loss": 0.84552777, "learning_rate": 2.558775088549599e-06, "loss": 0.86764073, "num_input_tokens_seen": 76589820, "step": 3556, "time_per_iteration": 2.4821133613586426 }, { "auxiliary_loss_clip": 0.01171928, "auxiliary_loss_mlp": 0.01035808, "balance_loss_clip": 1.05196667, "balance_loss_mlp": 1.02696323, "epoch": 0.42770396200324656, "flos": 14752822072320.0, "grad_norm": 2.841339589706848, "language_loss": 0.6683538, "learning_rate": 2.5580270944193467e-06, "loss": 0.69043118, "num_input_tokens_seen": 76606640, "step": 3557, "time_per_iteration": 2.4975662231445312 }, { "auxiliary_loss_clip": 0.01079801, "auxiliary_loss_mlp": 0.01003127, "balance_loss_clip": 1.02344513, "balance_loss_mlp": 1.00173807, "epoch": 0.4278242048938857, "flos": 70654712601600.0, "grad_norm": 0.746915425445617, "language_loss": 0.5549078, "learning_rate": 2.557279015636827e-06, "loss": 0.57573706, "num_input_tokens_seen": 76667050, "step": 3558, "time_per_iteration": 3.076401472091675 }, { "auxiliary_loss_clip": 0.01063261, "auxiliary_loss_mlp": 0.01002362, "balance_loss_clip": 1.01901424, "balance_loss_mlp": 1.00104463, "epoch": 0.42794444778452473, "flos": 69366165033600.0, "grad_norm": 0.7704086459669501, "language_loss": 0.61241019, "learning_rate": 2.5565308523155245e-06, "loss": 0.63306642, "num_input_tokens_seen": 76726650, "step": 3559, "time_per_iteration": 3.0400943756103516 }, { "auxiliary_loss_clip": 0.01124021, "auxiliary_loss_mlp": 0.01026229, "balance_loss_clip": 1.05110693, "balance_loss_mlp": 1.01843262, "epoch": 0.42806469067516384, "flos": 18215328481920.0, "grad_norm": 2.2350273676517443, "language_loss": 0.82064974, "learning_rate": 2.5557826045689336e-06, "loss": 0.84215224, "num_input_tokens_seen": 76742890, "step": 3560, "time_per_iteration": 2.602969169616699 }, { "auxiliary_loss_clip": 0.01040104, "auxiliary_loss_mlp": 0.01008519, "balance_loss_clip": 1.01976562, "balance_loss_mlp": 1.00715411, "epoch": 0.4281849335658029, "flos": 54535814432640.0, "grad_norm": 0.8228894270035056, "language_loss": 0.58820242, "learning_rate": 2.5550342725105643e-06, "loss": 0.60868865, "num_input_tokens_seen": 76801055, "step": 3561, "time_per_iteration": 3.1535661220550537 }, { "auxiliary_loss_clip": 0.01171641, "auxiliary_loss_mlp": 0.01028789, "balance_loss_clip": 1.05600858, "balance_loss_mlp": 1.0208497, "epoch": 0.428305176456442, "flos": 17274828723840.0, "grad_norm": 1.771844185203597, "language_loss": 0.80860913, "learning_rate": 2.554285856253937e-06, "loss": 0.83061337, "num_input_tokens_seen": 76819890, "step": 3562, "time_per_iteration": 2.547060251235962 }, { "auxiliary_loss_clip": 0.01154747, "auxiliary_loss_mlp": 0.01032753, "balance_loss_clip": 1.05499101, "balance_loss_mlp": 1.02462888, "epoch": 0.4284254193470811, "flos": 26359509749760.0, "grad_norm": 1.7018321167782147, "language_loss": 0.77253103, "learning_rate": 2.5535373559125855e-06, "loss": 0.79440606, "num_input_tokens_seen": 76840255, "step": 3563, "time_per_iteration": 2.6081290245056152 }, { "auxiliary_loss_clip": 0.01099273, "auxiliary_loss_mlp": 0.01033802, "balance_loss_clip": 1.04551113, "balance_loss_mlp": 1.02549314, "epoch": 0.42854566223772017, "flos": 29714248379520.0, "grad_norm": 1.5543949688232546, "language_loss": 0.82085967, "learning_rate": 2.552788771600057e-06, "loss": 0.84219038, "num_input_tokens_seen": 76860565, "step": 3564, "time_per_iteration": 2.7671759128570557 }, { "auxiliary_loss_clip": 0.0114564, "auxiliary_loss_mlp": 0.01028406, "balance_loss_clip": 1.05305481, "balance_loss_mlp": 1.01999009, "epoch": 0.4286659051283593, "flos": 22018161277440.0, "grad_norm": 1.8908039603351283, "language_loss": 0.8197329, "learning_rate": 2.5520401034299118e-06, "loss": 0.8414734, "num_input_tokens_seen": 76878325, "step": 3565, "time_per_iteration": 2.5884132385253906 }, { "auxiliary_loss_clip": 0.01169523, "auxiliary_loss_mlp": 0.01030344, "balance_loss_clip": 1.051718, "balance_loss_mlp": 1.02167726, "epoch": 0.4287861480189984, "flos": 13334422838400.0, "grad_norm": 1.8221299009625975, "language_loss": 0.87862378, "learning_rate": 2.551291351515722e-06, "loss": 0.90062249, "num_input_tokens_seen": 76895340, "step": 3566, "time_per_iteration": 2.531500816345215 }, { "auxiliary_loss_clip": 0.01135825, "auxiliary_loss_mlp": 0.00761798, "balance_loss_clip": 1.04648161, "balance_loss_mlp": 1.0002017, "epoch": 0.42890639090963745, "flos": 26651535321600.0, "grad_norm": 1.6950210506206846, "language_loss": 0.85685831, "learning_rate": 2.5505425159710726e-06, "loss": 0.87583458, "num_input_tokens_seen": 76915150, "step": 3567, "time_per_iteration": 2.643615245819092 }, { "auxiliary_loss_clip": 0.01160972, "auxiliary_loss_mlp": 0.00761588, "balance_loss_clip": 1.05129385, "balance_loss_mlp": 1.00017905, "epoch": 0.42902663380027656, "flos": 24055768091520.0, "grad_norm": 1.7762381468990922, "language_loss": 0.82559282, "learning_rate": 2.549793596909561e-06, "loss": 0.84481847, "num_input_tokens_seen": 76933770, "step": 3568, "time_per_iteration": 2.5990307331085205 }, { "auxiliary_loss_clip": 0.01150202, "auxiliary_loss_mlp": 0.0102549, "balance_loss_clip": 1.05101418, "balance_loss_mlp": 1.01787901, "epoch": 0.42914687669091567, "flos": 15632561975040.0, "grad_norm": 1.9548974937729346, "language_loss": 0.65694249, "learning_rate": 2.5490445944447976e-06, "loss": 0.67869943, "num_input_tokens_seen": 76952265, "step": 3569, "time_per_iteration": 2.5296788215637207 }, { "auxiliary_loss_clip": 0.01169766, "auxiliary_loss_mlp": 0.01031722, "balance_loss_clip": 1.0541482, "balance_loss_mlp": 1.02402687, "epoch": 0.4292671195815547, "flos": 31467802440960.0, "grad_norm": 1.9940041934317976, "language_loss": 0.65264922, "learning_rate": 2.548295508690406e-06, "loss": 0.67466414, "num_input_tokens_seen": 76973560, "step": 3570, "time_per_iteration": 3.3866593837738037 }, { "auxiliary_loss_clip": 0.01171313, "auxiliary_loss_mlp": 0.01031306, "balance_loss_clip": 1.05283761, "balance_loss_mlp": 1.02295542, "epoch": 0.42938736247219383, "flos": 30257756046720.0, "grad_norm": 3.0222863447335544, "language_loss": 0.76473081, "learning_rate": 2.5475463397600217e-06, "loss": 0.78675699, "num_input_tokens_seen": 76993640, "step": 3571, "time_per_iteration": 2.612635612487793 }, { "auxiliary_loss_clip": 0.01189098, "auxiliary_loss_mlp": 0.01031115, "balance_loss_clip": 1.05813837, "balance_loss_mlp": 1.02238286, "epoch": 0.42950760536283294, "flos": 29349683291520.0, "grad_norm": 2.2183247363756102, "language_loss": 0.77527893, "learning_rate": 2.546797087767293e-06, "loss": 0.79748106, "num_input_tokens_seen": 77013765, "step": 3572, "time_per_iteration": 2.534398317337036 }, { "auxiliary_loss_clip": 0.01125453, "auxiliary_loss_mlp": 0.0103065, "balance_loss_clip": 1.05023336, "balance_loss_mlp": 1.02196574, "epoch": 0.429627848253472, "flos": 26869943969280.0, "grad_norm": 1.727589318268984, "language_loss": 0.8742559, "learning_rate": 2.546047752825881e-06, "loss": 0.89581692, "num_input_tokens_seen": 77034370, "step": 3573, "time_per_iteration": 2.6650795936584473 }, { "auxiliary_loss_clip": 0.0113314, "auxiliary_loss_mlp": 0.01026882, "balance_loss_clip": 1.04841328, "balance_loss_mlp": 1.01896644, "epoch": 0.4297480911441111, "flos": 13881270470400.0, "grad_norm": 1.9764190050206698, "language_loss": 0.93142402, "learning_rate": 2.5452983350494595e-06, "loss": 0.95302415, "num_input_tokens_seen": 77049925, "step": 3574, "time_per_iteration": 2.5811736583709717 }, { "auxiliary_loss_clip": 0.01172147, "auxiliary_loss_mlp": 0.00761925, "balance_loss_clip": 1.05435908, "balance_loss_mlp": 1.00017154, "epoch": 0.4298683340347502, "flos": 20741141975040.0, "grad_norm": 11.889245555598064, "language_loss": 0.65001309, "learning_rate": 2.544548834551713e-06, "loss": 0.66935384, "num_input_tokens_seen": 77068930, "step": 3575, "time_per_iteration": 2.540356159210205 }, { "auxiliary_loss_clip": 0.01137615, "auxiliary_loss_mlp": 0.00761924, "balance_loss_clip": 1.05086493, "balance_loss_mlp": 1.00019598, "epoch": 0.4299885769253893, "flos": 20882126856960.0, "grad_norm": 2.342104251670206, "language_loss": 0.94706917, "learning_rate": 2.5437992514463424e-06, "loss": 0.96606463, "num_input_tokens_seen": 77082255, "step": 3576, "time_per_iteration": 3.4396071434020996 }, { "auxiliary_loss_clip": 0.0116622, "auxiliary_loss_mlp": 0.01032093, "balance_loss_clip": 1.05347705, "balance_loss_mlp": 1.0238378, "epoch": 0.4301088198160284, "flos": 25484618183040.0, "grad_norm": 2.3370439997608825, "language_loss": 0.8795765, "learning_rate": 2.5430495858470565e-06, "loss": 0.90155965, "num_input_tokens_seen": 77101725, "step": 3577, "time_per_iteration": 2.588655710220337 }, { "auxiliary_loss_clip": 0.0116361, "auxiliary_loss_mlp": 0.01029515, "balance_loss_clip": 1.05150926, "balance_loss_mlp": 1.02165294, "epoch": 0.43022906270666744, "flos": 18259427404800.0, "grad_norm": 2.4764458153396713, "language_loss": 0.77410656, "learning_rate": 2.54229983786758e-06, "loss": 0.79603785, "num_input_tokens_seen": 77119670, "step": 3578, "time_per_iteration": 2.483489990234375 }, { "auxiliary_loss_clip": 0.0115351, "auxiliary_loss_mlp": 0.0103103, "balance_loss_clip": 1.04918385, "balance_loss_mlp": 1.02285779, "epoch": 0.43034930559730655, "flos": 23399536567680.0, "grad_norm": 1.8131931564504504, "language_loss": 0.85152471, "learning_rate": 2.541550007621651e-06, "loss": 0.87337005, "num_input_tokens_seen": 77138160, "step": 3579, "time_per_iteration": 2.5920259952545166 }, { "auxiliary_loss_clip": 0.01166389, "auxiliary_loss_mlp": 0.01029652, "balance_loss_clip": 1.05363154, "balance_loss_mlp": 1.02141166, "epoch": 0.43046954848794566, "flos": 28184382264960.0, "grad_norm": 1.875515146419343, "language_loss": 0.79969609, "learning_rate": 2.5408000952230156e-06, "loss": 0.82165653, "num_input_tokens_seen": 77156950, "step": 3580, "time_per_iteration": 2.5608794689178467 }, { "auxiliary_loss_clip": 0.01151928, "auxiliary_loss_mlp": 0.01027627, "balance_loss_clip": 1.0510962, "balance_loss_mlp": 1.01937735, "epoch": 0.4305897913785847, "flos": 28580476515840.0, "grad_norm": 1.9618394321626313, "language_loss": 0.90385222, "learning_rate": 2.5400501007854357e-06, "loss": 0.92564774, "num_input_tokens_seen": 77176395, "step": 3581, "time_per_iteration": 2.6603164672851562 }, { "auxiliary_loss_clip": 0.0112615, "auxiliary_loss_mlp": 0.0102984, "balance_loss_clip": 1.04699039, "balance_loss_mlp": 1.02211475, "epoch": 0.43071003426922383, "flos": 20448721353600.0, "grad_norm": 1.7843197964945579, "language_loss": 0.7529065, "learning_rate": 2.539300024422685e-06, "loss": 0.77446634, "num_input_tokens_seen": 77194340, "step": 3582, "time_per_iteration": 2.617633819580078 }, { "auxiliary_loss_clip": 0.01041657, "auxiliary_loss_mlp": 0.01004346, "balance_loss_clip": 1.01801682, "balance_loss_mlp": 1.00312376, "epoch": 0.43083027715986294, "flos": 51997969883520.0, "grad_norm": 0.7892402288317762, "language_loss": 0.60958135, "learning_rate": 2.538549866248549e-06, "loss": 0.63004136, "num_input_tokens_seen": 77249320, "step": 3583, "time_per_iteration": 3.0012712478637695 }, { "auxiliary_loss_clip": 0.01172655, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.05470753, "balance_loss_mlp": 1.02503347, "epoch": 0.430950520050502, "flos": 16690885320960.0, "grad_norm": 1.812193596638398, "language_loss": 0.81083238, "learning_rate": 2.5377996263768274e-06, "loss": 0.83289242, "num_input_tokens_seen": 77267400, "step": 3584, "time_per_iteration": 2.515700101852417 }, { "auxiliary_loss_clip": 0.01167175, "auxiliary_loss_mlp": 0.010349, "balance_loss_clip": 1.05244279, "balance_loss_mlp": 1.0268476, "epoch": 0.4310707629411411, "flos": 24608433726720.0, "grad_norm": 1.8304490656569599, "language_loss": 0.68311709, "learning_rate": 2.5370493049213293e-06, "loss": 0.70513785, "num_input_tokens_seen": 77287045, "step": 3585, "time_per_iteration": 2.562526226043701 }, { "auxiliary_loss_clip": 0.0107879, "auxiliary_loss_mlp": 0.01030949, "balance_loss_clip": 1.04462981, "balance_loss_mlp": 1.02195466, "epoch": 0.4311910058317802, "flos": 26432983019520.0, "grad_norm": 1.8756317518122574, "language_loss": 0.79639399, "learning_rate": 2.536298901995878e-06, "loss": 0.81749135, "num_input_tokens_seen": 77306255, "step": 3586, "time_per_iteration": 2.8930907249450684 }, { "auxiliary_loss_clip": 0.01157198, "auxiliary_loss_mlp": 0.01032473, "balance_loss_clip": 1.05341399, "balance_loss_mlp": 1.02437901, "epoch": 0.43131124872241927, "flos": 25155891889920.0, "grad_norm": 1.8679843746911116, "language_loss": 0.80173123, "learning_rate": 2.535548417714311e-06, "loss": 0.82362795, "num_input_tokens_seen": 77325555, "step": 3587, "time_per_iteration": 2.6780292987823486 }, { "auxiliary_loss_clip": 0.01171361, "auxiliary_loss_mlp": 0.01031251, "balance_loss_clip": 1.05193388, "balance_loss_mlp": 1.02312064, "epoch": 0.4314314916130584, "flos": 21614812479360.0, "grad_norm": 1.4810496776651372, "language_loss": 0.87379265, "learning_rate": 2.534797852190474e-06, "loss": 0.89581877, "num_input_tokens_seen": 77345735, "step": 3588, "time_per_iteration": 2.5390706062316895 }, { "auxiliary_loss_clip": 0.01166681, "auxiliary_loss_mlp": 0.01034842, "balance_loss_clip": 1.0520916, "balance_loss_mlp": 1.02681923, "epoch": 0.4315517345036975, "flos": 19275016544640.0, "grad_norm": 1.9612873316895616, "language_loss": 0.81836873, "learning_rate": 2.5340472055382283e-06, "loss": 0.84038389, "num_input_tokens_seen": 77361765, "step": 3589, "time_per_iteration": 2.492030382156372 }, { "auxiliary_loss_clip": 0.01140162, "auxiliary_loss_mlp": 0.0102693, "balance_loss_clip": 1.04852247, "balance_loss_mlp": 1.01925898, "epoch": 0.43167197739433655, "flos": 24273853516800.0, "grad_norm": 1.9627728519318932, "language_loss": 0.81093127, "learning_rate": 2.5332964778714468e-06, "loss": 0.8326022, "num_input_tokens_seen": 77378950, "step": 3590, "time_per_iteration": 2.6189703941345215 }, { "auxiliary_loss_clip": 0.01141884, "auxiliary_loss_mlp": 0.01029632, "balance_loss_clip": 1.05269396, "balance_loss_mlp": 1.02178824, "epoch": 0.43179222028497566, "flos": 16867816738560.0, "grad_norm": 1.525306450822741, "language_loss": 0.65973306, "learning_rate": 2.5325456693040123e-06, "loss": 0.68144822, "num_input_tokens_seen": 77396145, "step": 3591, "time_per_iteration": 2.5647053718566895 }, { "auxiliary_loss_clip": 0.01173229, "auxiliary_loss_mlp": 0.01035829, "balance_loss_clip": 1.05179477, "balance_loss_mlp": 1.02696586, "epoch": 0.43191246317561477, "flos": 17639214243840.0, "grad_norm": 2.0520741408021044, "language_loss": 0.74564111, "learning_rate": 2.531794779949824e-06, "loss": 0.76773167, "num_input_tokens_seen": 77414045, "step": 3592, "time_per_iteration": 2.5061001777648926 }, { "auxiliary_loss_clip": 0.01135387, "auxiliary_loss_mlp": 0.01027681, "balance_loss_clip": 1.04943776, "balance_loss_mlp": 1.02038002, "epoch": 0.4320327060662538, "flos": 23878800760320.0, "grad_norm": 1.7032242848205428, "language_loss": 0.88127005, "learning_rate": 2.5310438099227903e-06, "loss": 0.90290082, "num_input_tokens_seen": 77431310, "step": 3593, "time_per_iteration": 2.605987310409546 }, { "auxiliary_loss_clip": 0.01064514, "auxiliary_loss_mlp": 0.0100115, "balance_loss_clip": 1.018435, "balance_loss_mlp": 0.99995786, "epoch": 0.43215294895689293, "flos": 66394917959040.0, "grad_norm": 0.8049650495500036, "language_loss": 0.53329861, "learning_rate": 2.530292759336833e-06, "loss": 0.5539552, "num_input_tokens_seen": 77492045, "step": 3594, "time_per_iteration": 3.178866147994995 }, { "auxiliary_loss_clip": 0.01151926, "auxiliary_loss_mlp": 0.01030004, "balance_loss_clip": 1.05294347, "balance_loss_mlp": 1.02136707, "epoch": 0.432273191847532, "flos": 20594267262720.0, "grad_norm": 2.1885519149514434, "language_loss": 0.69326806, "learning_rate": 2.5295416283058855e-06, "loss": 0.71508729, "num_input_tokens_seen": 77510910, "step": 3595, "time_per_iteration": 2.5556068420410156 }, { "auxiliary_loss_clip": 0.01151253, "auxiliary_loss_mlp": 0.00761783, "balance_loss_clip": 1.05060267, "balance_loss_mlp": 1.00018835, "epoch": 0.4323934347381711, "flos": 19282127437440.0, "grad_norm": 1.9226800306517742, "language_loss": 0.66127521, "learning_rate": 2.5287904169438943e-06, "loss": 0.68040562, "num_input_tokens_seen": 77530115, "step": 3596, "time_per_iteration": 3.549539089202881 }, { "auxiliary_loss_clip": 0.01107738, "auxiliary_loss_mlp": 0.01037109, "balance_loss_clip": 1.04779124, "balance_loss_mlp": 1.02835965, "epoch": 0.4325136776288102, "flos": 21726315273600.0, "grad_norm": 2.6074660072986, "language_loss": 0.63858062, "learning_rate": 2.528039125364817e-06, "loss": 0.66002911, "num_input_tokens_seen": 77548920, "step": 3597, "time_per_iteration": 2.7257888317108154 }, { "auxiliary_loss_clip": 0.01147032, "auxiliary_loss_mlp": 0.01032704, "balance_loss_clip": 1.05263627, "balance_loss_mlp": 1.02438879, "epoch": 0.43263392051944927, "flos": 22340746344960.0, "grad_norm": 1.9103278348464414, "language_loss": 0.75973475, "learning_rate": 2.5272877536826246e-06, "loss": 0.78153211, "num_input_tokens_seen": 77567715, "step": 3598, "time_per_iteration": 2.59759259223938 }, { "auxiliary_loss_clip": 0.01126038, "auxiliary_loss_mlp": 0.01034591, "balance_loss_clip": 1.04352677, "balance_loss_mlp": 1.02624011, "epoch": 0.4327541634100884, "flos": 29168406328320.0, "grad_norm": 2.349886209214647, "language_loss": 0.70451754, "learning_rate": 2.5265363020112986e-06, "loss": 0.72612387, "num_input_tokens_seen": 77588035, "step": 3599, "time_per_iteration": 2.701719284057617 }, { "auxiliary_loss_clip": 0.01169244, "auxiliary_loss_mlp": 0.01032303, "balance_loss_clip": 1.05394244, "balance_loss_mlp": 1.02419066, "epoch": 0.4328744063007275, "flos": 26067448264320.0, "grad_norm": 1.994671950463894, "language_loss": 0.8436203, "learning_rate": 2.5257847704648344e-06, "loss": 0.86563575, "num_input_tokens_seen": 77609265, "step": 3600, "time_per_iteration": 2.57527232170105 }, { "auxiliary_loss_clip": 0.01183263, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.05555034, "balance_loss_mlp": 1.02815115, "epoch": 0.43299464919136654, "flos": 16581357774720.0, "grad_norm": 1.806419277832725, "language_loss": 0.75297475, "learning_rate": 2.525033159157239e-06, "loss": 0.77516645, "num_input_tokens_seen": 77625580, "step": 3601, "time_per_iteration": 4.024463653564453 }, { "auxiliary_loss_clip": 0.01166197, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.05272865, "balance_loss_mlp": 1.02108049, "epoch": 0.43311489208200565, "flos": 16107265140480.0, "grad_norm": 1.790854493371774, "language_loss": 0.77050787, "learning_rate": 2.52428146820253e-06, "loss": 0.79247224, "num_input_tokens_seen": 77643835, "step": 3602, "time_per_iteration": 2.5137691497802734 }, { "auxiliary_loss_clip": 0.01143894, "auxiliary_loss_mlp": 0.01029632, "balance_loss_clip": 1.05102789, "balance_loss_mlp": 1.02060235, "epoch": 0.43323513497264476, "flos": 22930220442240.0, "grad_norm": 1.668592461694098, "language_loss": 0.81774116, "learning_rate": 2.52352969771474e-06, "loss": 0.83947641, "num_input_tokens_seen": 77663060, "step": 3603, "time_per_iteration": 2.6179873943328857 }, { "auxiliary_loss_clip": 0.01157422, "auxiliary_loss_mlp": 0.01030938, "balance_loss_clip": 1.05284572, "balance_loss_mlp": 1.02324045, "epoch": 0.4333553778632838, "flos": 25299031587840.0, "grad_norm": 2.1331858928940233, "language_loss": 0.88414061, "learning_rate": 2.5227778478079106e-06, "loss": 0.90602422, "num_input_tokens_seen": 77682470, "step": 3604, "time_per_iteration": 2.5929932594299316 }, { "auxiliary_loss_clip": 0.01164585, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.05008209, "balance_loss_mlp": 1.02765942, "epoch": 0.43347562075392293, "flos": 19387165783680.0, "grad_norm": 1.5820467580025375, "language_loss": 0.76470697, "learning_rate": 2.522025918596098e-06, "loss": 0.78671157, "num_input_tokens_seen": 77700770, "step": 3605, "time_per_iteration": 2.514413356781006 }, { "auxiliary_loss_clip": 0.01167822, "auxiliary_loss_mlp": 0.01030205, "balance_loss_clip": 1.05369961, "balance_loss_mlp": 1.02299237, "epoch": 0.43359586364456204, "flos": 26325969425280.0, "grad_norm": 1.929499599705031, "language_loss": 0.65473723, "learning_rate": 2.521273910193368e-06, "loss": 0.67671752, "num_input_tokens_seen": 77723950, "step": 3606, "time_per_iteration": 2.604297399520874 }, { "auxiliary_loss_clip": 0.01174541, "auxiliary_loss_mlp": 0.01029729, "balance_loss_clip": 1.05425644, "balance_loss_mlp": 1.02130103, "epoch": 0.4337161065352011, "flos": 15989261984640.0, "grad_norm": 2.0825110612524864, "language_loss": 0.87266529, "learning_rate": 2.5205218227138006e-06, "loss": 0.89470804, "num_input_tokens_seen": 77736905, "step": 3607, "time_per_iteration": 2.4767937660217285 }, { "auxiliary_loss_clip": 0.01185633, "auxiliary_loss_mlp": 0.01029098, "balance_loss_clip": 1.05561185, "balance_loss_mlp": 1.02096224, "epoch": 0.4338363494258402, "flos": 20224710184320.0, "grad_norm": 1.8489097331950264, "language_loss": 0.79140967, "learning_rate": 2.519769656271486e-06, "loss": 0.81355697, "num_input_tokens_seen": 77754325, "step": 3608, "time_per_iteration": 2.480135440826416 }, { "auxiliary_loss_clip": 0.01116881, "auxiliary_loss_mlp": 0.01030726, "balance_loss_clip": 1.04739618, "balance_loss_mlp": 1.02241731, "epoch": 0.43395659231647926, "flos": 20083904870400.0, "grad_norm": 2.306512799439487, "language_loss": 0.67602122, "learning_rate": 2.5190174109805285e-06, "loss": 0.69749731, "num_input_tokens_seen": 77774150, "step": 3609, "time_per_iteration": 2.5902862548828125 }, { "auxiliary_loss_clip": 0.01143692, "auxiliary_loss_mlp": 0.01027105, "balance_loss_clip": 1.04842615, "balance_loss_mlp": 1.01852834, "epoch": 0.43407683520711837, "flos": 19901801894400.0, "grad_norm": 2.3156158105556863, "language_loss": 0.64202988, "learning_rate": 2.518265086955042e-06, "loss": 0.66373789, "num_input_tokens_seen": 77791870, "step": 3610, "time_per_iteration": 2.5484066009521484 }, { "auxiliary_loss_clip": 0.01185414, "auxiliary_loss_mlp": 0.0103434, "balance_loss_clip": 1.05539536, "balance_loss_mlp": 1.02628696, "epoch": 0.4341970780977575, "flos": 23108732058240.0, "grad_norm": 1.7525056545646718, "language_loss": 0.8388952, "learning_rate": 2.5175126843091534e-06, "loss": 0.86109269, "num_input_tokens_seen": 77811240, "step": 3611, "time_per_iteration": 2.494366407394409 }, { "auxiliary_loss_clip": 0.01157506, "auxiliary_loss_mlp": 0.01038192, "balance_loss_clip": 1.05403209, "balance_loss_mlp": 1.03056228, "epoch": 0.43431732098839654, "flos": 37408288406400.0, "grad_norm": 2.226265724524801, "language_loss": 0.75613588, "learning_rate": 2.5167602031570034e-06, "loss": 0.77809286, "num_input_tokens_seen": 77831425, "step": 3612, "time_per_iteration": 2.684640884399414 }, { "auxiliary_loss_clip": 0.01185882, "auxiliary_loss_mlp": 0.01025872, "balance_loss_clip": 1.05549526, "balance_loss_mlp": 1.01779008, "epoch": 0.43443756387903565, "flos": 31868206323840.0, "grad_norm": 1.6303638061974248, "language_loss": 0.73214149, "learning_rate": 2.51600764361274e-06, "loss": 0.75425899, "num_input_tokens_seen": 77852950, "step": 3613, "time_per_iteration": 2.572516441345215 }, { "auxiliary_loss_clip": 0.01183424, "auxiliary_loss_mlp": 0.01031471, "balance_loss_clip": 1.05476022, "balance_loss_mlp": 1.02309704, "epoch": 0.43455780676967476, "flos": 23477139901440.0, "grad_norm": 2.1166228524089754, "language_loss": 0.79079098, "learning_rate": 2.5152550057905283e-06, "loss": 0.81293988, "num_input_tokens_seen": 77872840, "step": 3614, "time_per_iteration": 2.5285396575927734 }, { "auxiliary_loss_clip": 0.01171933, "auxiliary_loss_mlp": 0.00762401, "balance_loss_clip": 1.05482078, "balance_loss_mlp": 1.00016975, "epoch": 0.4346780496603138, "flos": 24207060176640.0, "grad_norm": 2.3049703736160856, "language_loss": 0.7681092, "learning_rate": 2.5145022898045415e-06, "loss": 0.78745258, "num_input_tokens_seen": 77892025, "step": 3615, "time_per_iteration": 2.5454652309417725 }, { "auxiliary_loss_clip": 0.0115582, "auxiliary_loss_mlp": 0.01028683, "balance_loss_clip": 1.04925311, "balance_loss_mlp": 1.02011788, "epoch": 0.4347982925509529, "flos": 17092366611840.0, "grad_norm": 1.9958052453647352, "language_loss": 0.89839894, "learning_rate": 2.5137494957689664e-06, "loss": 0.92024398, "num_input_tokens_seen": 77907635, "step": 3616, "time_per_iteration": 2.5301103591918945 }, { "auxiliary_loss_clip": 0.01051256, "auxiliary_loss_mlp": 0.01001832, "balance_loss_clip": 1.01752114, "balance_loss_mlp": 1.00046754, "epoch": 0.43491853544159204, "flos": 60945544696320.0, "grad_norm": 0.7668876585836026, "language_loss": 0.57349288, "learning_rate": 2.5129966237980016e-06, "loss": 0.59402382, "num_input_tokens_seen": 77970630, "step": 3617, "time_per_iteration": 3.1194331645965576 }, { "auxiliary_loss_clip": 0.01141451, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.04885209, "balance_loss_mlp": 1.02387547, "epoch": 0.4350387783322311, "flos": 21944652094080.0, "grad_norm": 1.8317883712192207, "language_loss": 0.78330994, "learning_rate": 2.512243674005857e-06, "loss": 0.80505037, "num_input_tokens_seen": 77989995, "step": 3618, "time_per_iteration": 2.6191089153289795 }, { "auxiliary_loss_clip": 0.01108589, "auxiliary_loss_mlp": 0.01025951, "balance_loss_clip": 1.04410207, "balance_loss_mlp": 1.01796746, "epoch": 0.4351590212228702, "flos": 25082705928960.0, "grad_norm": 1.7916766536241555, "language_loss": 0.86246312, "learning_rate": 2.5114906465067537e-06, "loss": 0.88380849, "num_input_tokens_seen": 78010980, "step": 3619, "time_per_iteration": 2.6998515129089355 }, { "auxiliary_loss_clip": 0.01168976, "auxiliary_loss_mlp": 0.01023736, "balance_loss_clip": 1.05112267, "balance_loss_mlp": 1.01599979, "epoch": 0.4352792641135093, "flos": 21506541909120.0, "grad_norm": 2.2375425216032756, "language_loss": 0.75143516, "learning_rate": 2.5107375414149264e-06, "loss": 0.77336228, "num_input_tokens_seen": 78030225, "step": 3620, "time_per_iteration": 2.562272787094116 }, { "auxiliary_loss_clip": 0.01116137, "auxiliary_loss_mlp": 0.01035214, "balance_loss_clip": 1.04306424, "balance_loss_mlp": 1.02655983, "epoch": 0.43539950700414837, "flos": 16253457494400.0, "grad_norm": 1.9807502836476336, "language_loss": 0.72257829, "learning_rate": 2.5099843588446197e-06, "loss": 0.74409175, "num_input_tokens_seen": 78048545, "step": 3621, "time_per_iteration": 2.586515188217163 }, { "auxiliary_loss_clip": 0.01130718, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.04946136, "balance_loss_mlp": 1.02126276, "epoch": 0.4355197498947875, "flos": 16691819074560.0, "grad_norm": 2.0325284165617346, "language_loss": 0.61170185, "learning_rate": 2.509231098910091e-06, "loss": 0.63330507, "num_input_tokens_seen": 78068415, "step": 3622, "time_per_iteration": 3.436128616333008 }, { "auxiliary_loss_clip": 0.01152348, "auxiliary_loss_mlp": 0.01031168, "balance_loss_clip": 1.05368948, "balance_loss_mlp": 1.02314496, "epoch": 0.4356399927854266, "flos": 16362733645440.0, "grad_norm": 2.4454130761593844, "language_loss": 0.74820101, "learning_rate": 2.508477761725611e-06, "loss": 0.77003616, "num_input_tokens_seen": 78086690, "step": 3623, "time_per_iteration": 2.542694568634033 }, { "auxiliary_loss_clip": 0.01171937, "auxiliary_loss_mlp": 0.01029426, "balance_loss_clip": 1.05187571, "balance_loss_mlp": 1.02155805, "epoch": 0.43576023567606564, "flos": 17202037812480.0, "grad_norm": 2.918574315264654, "language_loss": 0.80943024, "learning_rate": 2.507724347405458e-06, "loss": 0.83144391, "num_input_tokens_seen": 78104640, "step": 3624, "time_per_iteration": 2.5061440467834473 }, { "auxiliary_loss_clip": 0.01121003, "auxiliary_loss_mlp": 0.01028, "balance_loss_clip": 1.04417205, "balance_loss_mlp": 1.02015591, "epoch": 0.43588047856670475, "flos": 15917656222080.0, "grad_norm": 2.0023700866876752, "language_loss": 0.82144189, "learning_rate": 2.5069708560639243e-06, "loss": 0.84293193, "num_input_tokens_seen": 78122550, "step": 3625, "time_per_iteration": 2.6119801998138428 }, { "auxiliary_loss_clip": 0.01144345, "auxiliary_loss_mlp": 0.01032257, "balance_loss_clip": 1.05092967, "balance_loss_mlp": 1.02371585, "epoch": 0.4360007214573438, "flos": 23659566099840.0, "grad_norm": 1.8153053805938022, "language_loss": 0.61237109, "learning_rate": 2.5062172878153158e-06, "loss": 0.63413709, "num_input_tokens_seen": 78141825, "step": 3626, "time_per_iteration": 2.6256260871887207 }, { "auxiliary_loss_clip": 0.01118638, "auxiliary_loss_mlp": 0.01032832, "balance_loss_clip": 1.04738164, "balance_loss_mlp": 1.02430868, "epoch": 0.4361209643479829, "flos": 21978767036160.0, "grad_norm": 2.019967294684137, "language_loss": 0.87470901, "learning_rate": 2.505463642773947e-06, "loss": 0.89622366, "num_input_tokens_seen": 78161790, "step": 3627, "time_per_iteration": 4.288119554519653 }, { "auxiliary_loss_clip": 0.01141358, "auxiliary_loss_mlp": 0.0076226, "balance_loss_clip": 1.04898822, "balance_loss_mlp": 1.00013936, "epoch": 0.43624120723862203, "flos": 17420159151360.0, "grad_norm": 2.2194486742410326, "language_loss": 0.74859357, "learning_rate": 2.504709921054146e-06, "loss": 0.76762974, "num_input_tokens_seen": 78178605, "step": 3628, "time_per_iteration": 2.5614426136016846 }, { "auxiliary_loss_clip": 0.01133497, "auxiliary_loss_mlp": 0.01029387, "balance_loss_clip": 1.04402483, "balance_loss_mlp": 1.02102447, "epoch": 0.4363614501292611, "flos": 17895293280000.0, "grad_norm": 2.1031936585057567, "language_loss": 0.83388603, "learning_rate": 2.50395612277025e-06, "loss": 0.85551488, "num_input_tokens_seen": 78194460, "step": 3629, "time_per_iteration": 2.5531394481658936 }, { "auxiliary_loss_clip": 0.01159725, "auxiliary_loss_mlp": 0.01030575, "balance_loss_clip": 1.05144787, "balance_loss_mlp": 1.02270126, "epoch": 0.4364816930199002, "flos": 20302888135680.0, "grad_norm": 2.30116793901757, "language_loss": 0.72930813, "learning_rate": 2.503202248036612e-06, "loss": 0.75121111, "num_input_tokens_seen": 78213315, "step": 3630, "time_per_iteration": 2.550079345703125 }, { "auxiliary_loss_clip": 0.01181487, "auxiliary_loss_mlp": 0.01034671, "balance_loss_clip": 1.054497, "balance_loss_mlp": 1.02668989, "epoch": 0.4366019359105393, "flos": 24061334699520.0, "grad_norm": 1.7282447196458484, "language_loss": 0.73279142, "learning_rate": 2.5024482969675927e-06, "loss": 0.75495297, "num_input_tokens_seen": 78233270, "step": 3631, "time_per_iteration": 2.5163164138793945 }, { "auxiliary_loss_clip": 0.01128967, "auxiliary_loss_mlp": 0.01030096, "balance_loss_clip": 1.04657793, "balance_loss_mlp": 1.02258611, "epoch": 0.43672217880117836, "flos": 21754109422080.0, "grad_norm": 2.386931888186135, "language_loss": 0.84378922, "learning_rate": 2.501694269677566e-06, "loss": 0.86537981, "num_input_tokens_seen": 78251040, "step": 3632, "time_per_iteration": 2.667170763015747 }, { "auxiliary_loss_clip": 0.01171047, "auxiliary_loss_mlp": 0.01029438, "balance_loss_clip": 1.05074692, "balance_loss_mlp": 1.02109885, "epoch": 0.4368424216918175, "flos": 18035200753920.0, "grad_norm": 1.8924003049118214, "language_loss": 0.80559939, "learning_rate": 2.500940166280918e-06, "loss": 0.82760423, "num_input_tokens_seen": 78269470, "step": 3633, "time_per_iteration": 2.5299673080444336 }, { "auxiliary_loss_clip": 0.01167241, "auxiliary_loss_mlp": 0.01026286, "balance_loss_clip": 1.05265713, "balance_loss_mlp": 1.01863265, "epoch": 0.4369626645824566, "flos": 25447127362560.0, "grad_norm": 1.8993859611156458, "language_loss": 0.79189652, "learning_rate": 2.500185986892045e-06, "loss": 0.81383181, "num_input_tokens_seen": 78288955, "step": 3634, "time_per_iteration": 2.5598156452178955 }, { "auxiliary_loss_clip": 0.01162371, "auxiliary_loss_mlp": 0.01025076, "balance_loss_clip": 1.05010462, "balance_loss_mlp": 1.01711857, "epoch": 0.43708290747309564, "flos": 25302694775040.0, "grad_norm": 1.8616549633305064, "language_loss": 0.77005261, "learning_rate": 2.499431731625355e-06, "loss": 0.7919271, "num_input_tokens_seen": 78307980, "step": 3635, "time_per_iteration": 2.5477821826934814 }, { "auxiliary_loss_clip": 0.01183751, "auxiliary_loss_mlp": 0.01033728, "balance_loss_clip": 1.05366182, "balance_loss_mlp": 1.02496016, "epoch": 0.43720315036373475, "flos": 31575103344000.0, "grad_norm": 1.8146480193967212, "language_loss": 0.79521209, "learning_rate": 2.4986774005952686e-06, "loss": 0.81738687, "num_input_tokens_seen": 78330355, "step": 3636, "time_per_iteration": 2.5874812602996826 }, { "auxiliary_loss_clip": 0.01163634, "auxiliary_loss_mlp": 0.01027784, "balance_loss_clip": 1.05189586, "balance_loss_mlp": 1.02016091, "epoch": 0.43732339325437386, "flos": 23112000195840.0, "grad_norm": 2.2318340942458126, "language_loss": 0.84761918, "learning_rate": 2.4979229939162166e-06, "loss": 0.86953342, "num_input_tokens_seen": 78349135, "step": 3637, "time_per_iteration": 2.5399155616760254 }, { "auxiliary_loss_clip": 0.01167319, "auxiliary_loss_mlp": 0.01034106, "balance_loss_clip": 1.05420494, "balance_loss_mlp": 1.0258863, "epoch": 0.4374436361450129, "flos": 27746272080000.0, "grad_norm": 1.80224567131789, "language_loss": 0.80507398, "learning_rate": 2.4971685117026433e-06, "loss": 0.82708824, "num_input_tokens_seen": 78368900, "step": 3638, "time_per_iteration": 2.5968129634857178 }, { "auxiliary_loss_clip": 0.01169031, "auxiliary_loss_mlp": 0.01023706, "balance_loss_clip": 1.05221069, "balance_loss_mlp": 1.01613617, "epoch": 0.437563879035652, "flos": 24172370616960.0, "grad_norm": 1.5238071572154641, "language_loss": 0.76567465, "learning_rate": 2.4964139540690018e-06, "loss": 0.78760201, "num_input_tokens_seen": 78392235, "step": 3639, "time_per_iteration": 2.555224657058716 }, { "auxiliary_loss_clip": 0.01137893, "auxiliary_loss_mlp": 0.01029401, "balance_loss_clip": 1.04836655, "balance_loss_mlp": 1.02029967, "epoch": 0.4376841219262911, "flos": 23477211728640.0, "grad_norm": 2.351266796541749, "language_loss": 0.72711015, "learning_rate": 2.495659321129758e-06, "loss": 0.74878311, "num_input_tokens_seen": 78409980, "step": 3640, "time_per_iteration": 2.604569435119629 }, { "auxiliary_loss_clip": 0.01164136, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.05023217, "balance_loss_mlp": 1.02301598, "epoch": 0.4378043648169302, "flos": 25447809720960.0, "grad_norm": 1.7473342370396368, "language_loss": 0.7542429, "learning_rate": 2.494904612999389e-06, "loss": 0.77619028, "num_input_tokens_seen": 78428690, "step": 3641, "time_per_iteration": 2.548095941543579 }, { "auxiliary_loss_clip": 0.01062955, "auxiliary_loss_mlp": 0.01004581, "balance_loss_clip": 1.01967096, "balance_loss_mlp": 1.00324535, "epoch": 0.4379246077075693, "flos": 53914056986880.0, "grad_norm": 0.7483243929496436, "language_loss": 0.5658747, "learning_rate": 2.4941498297923843e-06, "loss": 0.58655, "num_input_tokens_seen": 78489260, "step": 3642, "time_per_iteration": 3.0750815868377686 }, { "auxiliary_loss_clip": 0.01165031, "auxiliary_loss_mlp": 0.01028984, "balance_loss_clip": 1.05206823, "balance_loss_mlp": 1.02067471, "epoch": 0.43804485059820836, "flos": 20588305605120.0, "grad_norm": 1.715225972078742, "language_loss": 0.69634312, "learning_rate": 2.4933949716232424e-06, "loss": 0.7182833, "num_input_tokens_seen": 78506785, "step": 3643, "time_per_iteration": 2.506934642791748 }, { "auxiliary_loss_clip": 0.01137145, "auxiliary_loss_mlp": 0.0102621, "balance_loss_clip": 1.05043471, "balance_loss_mlp": 1.01831222, "epoch": 0.43816509348884747, "flos": 23876214981120.0, "grad_norm": 1.9858395287895985, "language_loss": 0.7346769, "learning_rate": 2.492640038606476e-06, "loss": 0.75631046, "num_input_tokens_seen": 78525150, "step": 3644, "time_per_iteration": 2.5861377716064453 }, { "auxiliary_loss_clip": 0.01165805, "auxiliary_loss_mlp": 0.010327, "balance_loss_clip": 1.04979563, "balance_loss_mlp": 1.02446246, "epoch": 0.4382853363794866, "flos": 14684448533760.0, "grad_norm": 1.9959624198261936, "language_loss": 0.78891188, "learning_rate": 2.491885030856608e-06, "loss": 0.81089693, "num_input_tokens_seen": 78543245, "step": 3645, "time_per_iteration": 2.4959805011749268 }, { "auxiliary_loss_clip": 0.01155053, "auxiliary_loss_mlp": 0.01031689, "balance_loss_clip": 1.05147398, "balance_loss_mlp": 1.02397609, "epoch": 0.43840557927012563, "flos": 17165301177600.0, "grad_norm": 2.00398963515423, "language_loss": 0.82627708, "learning_rate": 2.4911299484881713e-06, "loss": 0.84814453, "num_input_tokens_seen": 78560775, "step": 3646, "time_per_iteration": 2.534351348876953 }, { "auxiliary_loss_clip": 0.01148707, "auxiliary_loss_mlp": 0.0102957, "balance_loss_clip": 1.04922676, "balance_loss_mlp": 1.02190495, "epoch": 0.43852582216076474, "flos": 19390685316480.0, "grad_norm": 1.6537787792669818, "language_loss": 0.80927181, "learning_rate": 2.490374791615712e-06, "loss": 0.83105457, "num_input_tokens_seen": 78580800, "step": 3647, "time_per_iteration": 2.5626070499420166 }, { "auxiliary_loss_clip": 0.01185538, "auxiliary_loss_mlp": 0.00762872, "balance_loss_clip": 1.05554664, "balance_loss_mlp": 1.00014544, "epoch": 0.43864606505140386, "flos": 18075133699200.0, "grad_norm": 2.4399299879048084, "language_loss": 0.77647495, "learning_rate": 2.4896195603537867e-06, "loss": 0.79595912, "num_input_tokens_seen": 78595410, "step": 3648, "time_per_iteration": 3.260354518890381 }, { "auxiliary_loss_clip": 0.01118852, "auxiliary_loss_mlp": 0.01035343, "balance_loss_clip": 1.05047047, "balance_loss_mlp": 1.02749896, "epoch": 0.4387663079420429, "flos": 19644896845440.0, "grad_norm": 2.236622160298854, "language_loss": 0.74193943, "learning_rate": 2.488864254816964e-06, "loss": 0.76348144, "num_input_tokens_seen": 78614100, "step": 3649, "time_per_iteration": 2.5674123764038086 }, { "auxiliary_loss_clip": 0.01171034, "auxiliary_loss_mlp": 0.01033729, "balance_loss_clip": 1.05424976, "balance_loss_mlp": 1.0252353, "epoch": 0.438886550832682, "flos": 19719339782400.0, "grad_norm": 2.6446003621770933, "language_loss": 0.6790961, "learning_rate": 2.4881088751198218e-06, "loss": 0.70114374, "num_input_tokens_seen": 78632260, "step": 3650, "time_per_iteration": 2.515728235244751 }, { "auxiliary_loss_clip": 0.01151969, "auxiliary_loss_mlp": 0.01030032, "balance_loss_clip": 1.04853034, "balance_loss_mlp": 1.02187204, "epoch": 0.43900679372332113, "flos": 14536675981440.0, "grad_norm": 2.5067159553675973, "language_loss": 0.64679307, "learning_rate": 2.4873534213769517e-06, "loss": 0.66861308, "num_input_tokens_seen": 78647490, "step": 3651, "time_per_iteration": 2.567798614501953 }, { "auxiliary_loss_clip": 0.0113675, "auxiliary_loss_mlp": 0.0102906, "balance_loss_clip": 1.05186558, "balance_loss_mlp": 1.02141905, "epoch": 0.4391270366139602, "flos": 24056234968320.0, "grad_norm": 1.8703564818113394, "language_loss": 0.71792251, "learning_rate": 2.4865978937029547e-06, "loss": 0.73958063, "num_input_tokens_seen": 78666470, "step": 3652, "time_per_iteration": 2.6023499965667725 }, { "auxiliary_loss_clip": 0.01112626, "auxiliary_loss_mlp": 0.01029055, "balance_loss_clip": 1.04566455, "balance_loss_mlp": 1.02150917, "epoch": 0.4392472795045993, "flos": 31538510363520.0, "grad_norm": 1.5552366077013986, "language_loss": 0.66296995, "learning_rate": 2.485842292212445e-06, "loss": 0.68438679, "num_input_tokens_seen": 78687685, "step": 3653, "time_per_iteration": 3.6000313758850098 }, { "auxiliary_loss_clip": 0.01184159, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 1.05536664, "balance_loss_mlp": 1.02425408, "epoch": 0.4393675223952384, "flos": 14866300114560.0, "grad_norm": 2.4287720431926347, "language_loss": 0.80511844, "learning_rate": 2.485086617020045e-06, "loss": 0.827281, "num_input_tokens_seen": 78706180, "step": 3654, "time_per_iteration": 3.246129035949707 }, { "auxiliary_loss_clip": 0.01145312, "auxiliary_loss_mlp": 0.01025921, "balance_loss_clip": 1.04799676, "balance_loss_mlp": 1.01786232, "epoch": 0.43948776528587746, "flos": 14825900292480.0, "grad_norm": 1.9764129928488916, "language_loss": 0.81934607, "learning_rate": 2.4843308682403903e-06, "loss": 0.84105837, "num_input_tokens_seen": 78723095, "step": 3655, "time_per_iteration": 2.5311496257781982 }, { "auxiliary_loss_clip": 0.01180921, "auxiliary_loss_mlp": 0.01031697, "balance_loss_clip": 1.05215883, "balance_loss_mlp": 1.02418709, "epoch": 0.4396080081765166, "flos": 13914523486080.0, "grad_norm": 1.597471782173635, "language_loss": 0.82419157, "learning_rate": 2.4835750459881294e-06, "loss": 0.84631771, "num_input_tokens_seen": 78739720, "step": 3656, "time_per_iteration": 2.4628803730010986 }, { "auxiliary_loss_clip": 0.01147614, "auxiliary_loss_mlp": 0.01035227, "balance_loss_clip": 1.04930472, "balance_loss_mlp": 1.02612531, "epoch": 0.43972825106715563, "flos": 18222978078720.0, "grad_norm": 1.941843068724885, "language_loss": 0.81867075, "learning_rate": 2.4828191503779177e-06, "loss": 0.84049916, "num_input_tokens_seen": 78757820, "step": 3657, "time_per_iteration": 2.5373754501342773 }, { "auxiliary_loss_clip": 0.01136334, "auxiliary_loss_mlp": 0.01030813, "balance_loss_clip": 1.04775023, "balance_loss_mlp": 1.02180719, "epoch": 0.43984849395779474, "flos": 16873239692160.0, "grad_norm": 1.8515180652248469, "language_loss": 0.89761603, "learning_rate": 2.482063181524425e-06, "loss": 0.91928756, "num_input_tokens_seen": 78773720, "step": 3658, "time_per_iteration": 2.533421754837036 }, { "auxiliary_loss_clip": 0.01185472, "auxiliary_loss_mlp": 0.01036014, "balance_loss_clip": 1.0574007, "balance_loss_mlp": 1.02663803, "epoch": 0.43996873684843385, "flos": 18691504104960.0, "grad_norm": 2.0852773996166207, "language_loss": 0.81583095, "learning_rate": 2.4813071395423307e-06, "loss": 0.83804578, "num_input_tokens_seen": 78791285, "step": 3659, "time_per_iteration": 2.4743473529815674 }, { "auxiliary_loss_clip": 0.0117028, "auxiliary_loss_mlp": 0.01030044, "balance_loss_clip": 1.0519917, "balance_loss_mlp": 1.02153277, "epoch": 0.4400889797390729, "flos": 23653460787840.0, "grad_norm": 1.7064208478686842, "language_loss": 0.6441437, "learning_rate": 2.4805510245463263e-06, "loss": 0.66614693, "num_input_tokens_seen": 78811440, "step": 3660, "time_per_iteration": 2.5392651557922363 }, { "auxiliary_loss_clip": 0.01163987, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.04937363, "balance_loss_mlp": 1.02454245, "epoch": 0.440209222629712, "flos": 23149203707520.0, "grad_norm": 2.4011307182740924, "language_loss": 0.6025871, "learning_rate": 2.4797948366511137e-06, "loss": 0.62455714, "num_input_tokens_seen": 78831150, "step": 3661, "time_per_iteration": 2.529275417327881 }, { "auxiliary_loss_clip": 0.01135531, "auxiliary_loss_mlp": 0.01033196, "balance_loss_clip": 1.04580712, "balance_loss_mlp": 1.02498889, "epoch": 0.4403294655203511, "flos": 24823394668800.0, "grad_norm": 1.7447646205832903, "language_loss": 0.76046097, "learning_rate": 2.4790385759714055e-06, "loss": 0.78214824, "num_input_tokens_seen": 78850215, "step": 3662, "time_per_iteration": 2.607314109802246 }, { "auxiliary_loss_clip": 0.01164688, "auxiliary_loss_mlp": 0.01028166, "balance_loss_clip": 1.05341768, "balance_loss_mlp": 1.02073956, "epoch": 0.4404497084109902, "flos": 22565080736640.0, "grad_norm": 1.6664371334875647, "language_loss": 0.71180958, "learning_rate": 2.478282242621926e-06, "loss": 0.73373806, "num_input_tokens_seen": 78870675, "step": 3663, "time_per_iteration": 2.550309419631958 }, { "auxiliary_loss_clip": 0.0104595, "auxiliary_loss_mlp": 0.01003969, "balance_loss_clip": 1.02023816, "balance_loss_mlp": 1.00262761, "epoch": 0.4405699513016293, "flos": 64967073448320.0, "grad_norm": 0.8436384432442813, "language_loss": 0.59605211, "learning_rate": 2.477525836717411e-06, "loss": 0.61655128, "num_input_tokens_seen": 78938440, "step": 3664, "time_per_iteration": 3.287449836730957 }, { "auxiliary_loss_clip": 0.0116681, "auxiliary_loss_mlp": 0.01032329, "balance_loss_clip": 1.05150414, "balance_loss_mlp": 1.02405596, "epoch": 0.4406901941922684, "flos": 35661952978560.0, "grad_norm": 2.381316932851427, "language_loss": 0.79098427, "learning_rate": 2.476769358372606e-06, "loss": 0.81297565, "num_input_tokens_seen": 78960090, "step": 3665, "time_per_iteration": 2.629042148590088 }, { "auxiliary_loss_clip": 0.01137725, "auxiliary_loss_mlp": 0.01028866, "balance_loss_clip": 1.05157721, "balance_loss_mlp": 1.02128696, "epoch": 0.44081043708290746, "flos": 18040767361920.0, "grad_norm": 2.769569073390376, "language_loss": 0.75174904, "learning_rate": 2.4760128077022683e-06, "loss": 0.77341491, "num_input_tokens_seen": 78978225, "step": 3666, "time_per_iteration": 2.557176113128662 }, { "auxiliary_loss_clip": 0.01111277, "auxiliary_loss_mlp": 0.01026427, "balance_loss_clip": 1.04482329, "balance_loss_mlp": 1.01834786, "epoch": 0.44093067997354657, "flos": 30153507799680.0, "grad_norm": 1.4503410054776142, "language_loss": 0.68460256, "learning_rate": 2.4752561848211672e-06, "loss": 0.70597959, "num_input_tokens_seen": 79000625, "step": 3667, "time_per_iteration": 2.661189317703247 }, { "auxiliary_loss_clip": 0.01166118, "auxiliary_loss_mlp": 0.01031824, "balance_loss_clip": 1.05454731, "balance_loss_mlp": 1.02439785, "epoch": 0.4410509228641857, "flos": 23255068066560.0, "grad_norm": 2.034989413271475, "language_loss": 0.71115094, "learning_rate": 2.4744994898440797e-06, "loss": 0.73313034, "num_input_tokens_seen": 79019415, "step": 3668, "time_per_iteration": 2.537574291229248 }, { "auxiliary_loss_clip": 0.01143853, "auxiliary_loss_mlp": 0.01028223, "balance_loss_clip": 1.04916382, "balance_loss_mlp": 1.01983643, "epoch": 0.44117116575482473, "flos": 19500571998720.0, "grad_norm": 1.9579947909509583, "language_loss": 0.83239084, "learning_rate": 2.473742722885797e-06, "loss": 0.85411167, "num_input_tokens_seen": 79038435, "step": 3669, "time_per_iteration": 2.57110333442688 }, { "auxiliary_loss_clip": 0.01167249, "auxiliary_loss_mlp": 0.00762111, "balance_loss_clip": 1.05410194, "balance_loss_mlp": 1.00019169, "epoch": 0.44129140864546385, "flos": 27053124353280.0, "grad_norm": 2.2732836795887255, "language_loss": 0.65231735, "learning_rate": 2.4729858840611197e-06, "loss": 0.67161095, "num_input_tokens_seen": 79057345, "step": 3670, "time_per_iteration": 2.5720767974853516 }, { "auxiliary_loss_clip": 0.01181796, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.05471992, "balance_loss_mlp": 1.02521467, "epoch": 0.4414116515361029, "flos": 26102101910400.0, "grad_norm": 1.8114658503077117, "language_loss": 0.72671103, "learning_rate": 2.4722289734848605e-06, "loss": 0.74886286, "num_input_tokens_seen": 79077810, "step": 3671, "time_per_iteration": 2.5232865810394287 }, { "auxiliary_loss_clip": 0.0113757, "auxiliary_loss_mlp": 0.01027967, "balance_loss_clip": 1.0527178, "balance_loss_mlp": 1.02030778, "epoch": 0.441531894426742, "flos": 21906083865600.0, "grad_norm": 1.979328064028822, "language_loss": 0.77698612, "learning_rate": 2.471471991271841e-06, "loss": 0.79864144, "num_input_tokens_seen": 79094935, "step": 3672, "time_per_iteration": 2.5861072540283203 }, { "auxiliary_loss_clip": 0.01157678, "auxiliary_loss_mlp": 0.01030315, "balance_loss_clip": 1.04863203, "balance_loss_mlp": 1.02239418, "epoch": 0.4416521373173811, "flos": 23437099215360.0, "grad_norm": 1.7858626856995332, "language_loss": 0.79535162, "learning_rate": 2.470714937536896e-06, "loss": 0.81723154, "num_input_tokens_seen": 79113660, "step": 3673, "time_per_iteration": 2.5166759490966797 }, { "auxiliary_loss_clip": 0.01119573, "auxiliary_loss_mlp": 0.01030909, "balance_loss_clip": 1.04470587, "balance_loss_mlp": 1.02282071, "epoch": 0.4417723802080202, "flos": 20334345471360.0, "grad_norm": 1.8978006927696147, "language_loss": 0.70746291, "learning_rate": 2.469957812394868e-06, "loss": 0.72896773, "num_input_tokens_seen": 79132470, "step": 3674, "time_per_iteration": 3.373581647872925 }, { "auxiliary_loss_clip": 0.0118179, "auxiliary_loss_mlp": 0.01030038, "balance_loss_clip": 1.05560493, "balance_loss_mlp": 1.02205729, "epoch": 0.4418926230986593, "flos": 18880682060160.0, "grad_norm": 1.8888998618160189, "language_loss": 0.76250106, "learning_rate": 2.4692006159606148e-06, "loss": 0.78461933, "num_input_tokens_seen": 79150000, "step": 3675, "time_per_iteration": 2.5005860328674316 }, { "auxiliary_loss_clip": 0.01181065, "auxiliary_loss_mlp": 0.01027873, "balance_loss_clip": 1.05392253, "balance_loss_mlp": 1.01988649, "epoch": 0.4420128659892984, "flos": 19464409981440.0, "grad_norm": 1.6646423916833633, "language_loss": 0.78793371, "learning_rate": 2.468443348349e-06, "loss": 0.81002319, "num_input_tokens_seen": 79167875, "step": 3676, "time_per_iteration": 2.462533950805664 }, { "auxiliary_loss_clip": 0.01121264, "auxiliary_loss_mlp": 0.01026836, "balance_loss_clip": 1.04470873, "balance_loss_mlp": 1.01925445, "epoch": 0.44213310887993745, "flos": 17894359526400.0, "grad_norm": 2.298887330239364, "language_loss": 0.82528985, "learning_rate": 2.467686009674902e-06, "loss": 0.84677088, "num_input_tokens_seen": 79182325, "step": 3677, "time_per_iteration": 2.5998852252960205 }, { "auxiliary_loss_clip": 0.01161147, "auxiliary_loss_mlp": 0.01027201, "balance_loss_clip": 1.04958391, "balance_loss_mlp": 1.01834941, "epoch": 0.44225335177057656, "flos": 19204667758080.0, "grad_norm": 1.9823622068428723, "language_loss": 0.85050106, "learning_rate": 2.466928600053209e-06, "loss": 0.87238455, "num_input_tokens_seen": 79197630, "step": 3678, "time_per_iteration": 2.4864389896392822 }, { "auxiliary_loss_clip": 0.01148075, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.04715312, "balance_loss_mlp": 1.02196717, "epoch": 0.4423735946612157, "flos": 23471321898240.0, "grad_norm": 1.7468959035035383, "language_loss": 0.71397638, "learning_rate": 2.466171119598818e-06, "loss": 0.73575878, "num_input_tokens_seen": 79217600, "step": 3679, "time_per_iteration": 4.1176323890686035 }, { "auxiliary_loss_clip": 0.01170383, "auxiliary_loss_mlp": 0.01028538, "balance_loss_clip": 1.05006528, "balance_loss_mlp": 1.01984131, "epoch": 0.44249383755185473, "flos": 26685398868480.0, "grad_norm": 1.7584698318825305, "language_loss": 0.77125156, "learning_rate": 2.465413568426639e-06, "loss": 0.79324073, "num_input_tokens_seen": 79238550, "step": 3680, "time_per_iteration": 3.2588226795196533 }, { "auxiliary_loss_clip": 0.01165047, "auxiliary_loss_mlp": 0.01023049, "balance_loss_clip": 1.05396843, "balance_loss_mlp": 1.01546109, "epoch": 0.44261408044249384, "flos": 23147659422720.0, "grad_norm": 1.5992341460296036, "language_loss": 0.81109458, "learning_rate": 2.464655946651591e-06, "loss": 0.83297551, "num_input_tokens_seen": 79257555, "step": 3681, "time_per_iteration": 2.5403196811676025 }, { "auxiliary_loss_clip": 0.01166969, "auxiliary_loss_mlp": 0.01035452, "balance_loss_clip": 1.05196035, "balance_loss_mlp": 1.02720857, "epoch": 0.44273432333313295, "flos": 24462564595200.0, "grad_norm": 1.950938520072444, "language_loss": 0.8086955, "learning_rate": 2.4638982543886065e-06, "loss": 0.83071971, "num_input_tokens_seen": 79277595, "step": 3682, "time_per_iteration": 2.547748565673828 }, { "auxiliary_loss_clip": 0.01171683, "auxiliary_loss_mlp": 0.01028525, "balance_loss_clip": 1.05376184, "balance_loss_mlp": 1.02011466, "epoch": 0.442854566223772, "flos": 17528932512000.0, "grad_norm": 5.180652532348916, "language_loss": 0.8735956, "learning_rate": 2.4631404917526254e-06, "loss": 0.89559758, "num_input_tokens_seen": 79294550, "step": 3683, "time_per_iteration": 2.4795420169830322 }, { "auxiliary_loss_clip": 0.01156708, "auxiliary_loss_mlp": 0.01025036, "balance_loss_clip": 1.04748988, "balance_loss_mlp": 1.01678097, "epoch": 0.4429748091144111, "flos": 24896293320960.0, "grad_norm": 1.6357021328017425, "language_loss": 0.78990614, "learning_rate": 2.4623826588586e-06, "loss": 0.81172353, "num_input_tokens_seen": 79314820, "step": 3684, "time_per_iteration": 2.5401480197906494 }, { "auxiliary_loss_clip": 0.01143427, "auxiliary_loss_mlp": 0.01025318, "balance_loss_clip": 1.04477668, "balance_loss_mlp": 1.01685452, "epoch": 0.4430950520050502, "flos": 21614704738560.0, "grad_norm": 1.6421963109300628, "language_loss": 0.82871526, "learning_rate": 2.461624755821492e-06, "loss": 0.85040271, "num_input_tokens_seen": 79334300, "step": 3685, "time_per_iteration": 2.5685486793518066 }, { "auxiliary_loss_clip": 0.01141008, "auxiliary_loss_mlp": 0.01032448, "balance_loss_clip": 1.05037796, "balance_loss_mlp": 1.02430046, "epoch": 0.4432152948956893, "flos": 24572271709440.0, "grad_norm": 1.9585472290685821, "language_loss": 0.76424009, "learning_rate": 2.4608667827562763e-06, "loss": 0.78597462, "num_input_tokens_seen": 79353630, "step": 3686, "time_per_iteration": 2.606872797012329 }, { "auxiliary_loss_clip": 0.01171178, "auxiliary_loss_mlp": 0.01028976, "balance_loss_clip": 1.05279326, "balance_loss_mlp": 1.02089345, "epoch": 0.4433355377863284, "flos": 21762261809280.0, "grad_norm": 1.9605163276579123, "language_loss": 0.90052909, "learning_rate": 2.460108739777936e-06, "loss": 0.92253065, "num_input_tokens_seen": 79372765, "step": 3687, "time_per_iteration": 2.5326123237609863 }, { "auxiliary_loss_clip": 0.01152105, "auxiliary_loss_mlp": 0.01028489, "balance_loss_clip": 1.05253077, "balance_loss_mlp": 1.02103806, "epoch": 0.44345578067696745, "flos": 20084479488000.0, "grad_norm": 1.460018936225211, "language_loss": 0.76464283, "learning_rate": 2.4593506270014656e-06, "loss": 0.78644872, "num_input_tokens_seen": 79391735, "step": 3688, "time_per_iteration": 2.525702714920044 }, { "auxiliary_loss_clip": 0.01152971, "auxiliary_loss_mlp": 0.01030079, "balance_loss_clip": 1.0482862, "balance_loss_mlp": 1.02188945, "epoch": 0.44357602356760656, "flos": 24169497528960.0, "grad_norm": 1.5547629625927406, "language_loss": 0.81860745, "learning_rate": 2.45859244454187e-06, "loss": 0.84043789, "num_input_tokens_seen": 79411525, "step": 3689, "time_per_iteration": 2.611504554748535 }, { "auxiliary_loss_clip": 0.011656, "auxiliary_loss_mlp": 0.01028378, "balance_loss_clip": 1.052652, "balance_loss_mlp": 1.02068293, "epoch": 0.44369626645824567, "flos": 22707717644160.0, "grad_norm": 1.6026801094566474, "language_loss": 0.65959865, "learning_rate": 2.4578341925141655e-06, "loss": 0.6815384, "num_input_tokens_seen": 79430740, "step": 3690, "time_per_iteration": 2.529740571975708 }, { "auxiliary_loss_clip": 0.01170563, "auxiliary_loss_mlp": 0.01032795, "balance_loss_clip": 1.05125046, "balance_loss_mlp": 1.02464151, "epoch": 0.4438165093488847, "flos": 38030225420160.0, "grad_norm": 2.431541790362739, "language_loss": 0.72273368, "learning_rate": 2.457075871033378e-06, "loss": 0.74476731, "num_input_tokens_seen": 79452615, "step": 3691, "time_per_iteration": 2.672551393508911 }, { "auxiliary_loss_clip": 0.01137355, "auxiliary_loss_mlp": 0.01027844, "balance_loss_clip": 1.05109489, "balance_loss_mlp": 1.01996446, "epoch": 0.44393675223952384, "flos": 15523213996800.0, "grad_norm": 1.9604060501236367, "language_loss": 0.88461423, "learning_rate": 2.4563174802145445e-06, "loss": 0.90626621, "num_input_tokens_seen": 79469865, "step": 3692, "time_per_iteration": 2.5745689868927 }, { "auxiliary_loss_clip": 0.01054237, "auxiliary_loss_mlp": 0.01008638, "balance_loss_clip": 1.01842332, "balance_loss_mlp": 1.00718331, "epoch": 0.44405699513016295, "flos": 64574893779840.0, "grad_norm": 0.6406451687479837, "language_loss": 0.48622996, "learning_rate": 2.455559020172712e-06, "loss": 0.50685877, "num_input_tokens_seen": 79537220, "step": 3693, "time_per_iteration": 3.225107431411743 }, { "auxiliary_loss_clip": 0.01129023, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.05228472, "balance_loss_mlp": 1.02420247, "epoch": 0.444177238020802, "flos": 23987394552960.0, "grad_norm": 2.2250632606652596, "language_loss": 0.896837, "learning_rate": 2.4548004910229385e-06, "loss": 0.91845751, "num_input_tokens_seen": 79554795, "step": 3694, "time_per_iteration": 2.655611276626587 }, { "auxiliary_loss_clip": 0.01170702, "auxiliary_loss_mlp": 0.00761911, "balance_loss_clip": 1.05355299, "balance_loss_mlp": 1.00020242, "epoch": 0.4442974809114411, "flos": 22563069575040.0, "grad_norm": 1.7965310753674983, "language_loss": 0.8687489, "learning_rate": 2.4540418928802913e-06, "loss": 0.88807505, "num_input_tokens_seen": 79573530, "step": 3695, "time_per_iteration": 2.5192062854766846 }, { "auxiliary_loss_clip": 0.0115067, "auxiliary_loss_mlp": 0.01033914, "balance_loss_clip": 1.04919207, "balance_loss_mlp": 1.02528906, "epoch": 0.4444177238020802, "flos": 17675699483520.0, "grad_norm": 2.3241125390670847, "language_loss": 0.66085792, "learning_rate": 2.4532832258598506e-06, "loss": 0.68270373, "num_input_tokens_seen": 79591360, "step": 3696, "time_per_iteration": 2.5364935398101807 }, { "auxiliary_loss_clip": 0.01178289, "auxiliary_loss_mlp": 0.01029482, "balance_loss_clip": 1.05276024, "balance_loss_mlp": 1.02198362, "epoch": 0.4445379666927193, "flos": 28621594609920.0, "grad_norm": 1.783029574133204, "language_loss": 0.80561662, "learning_rate": 2.4525244900767047e-06, "loss": 0.82769436, "num_input_tokens_seen": 79612175, "step": 3697, "time_per_iteration": 2.5528178215026855 }, { "auxiliary_loss_clip": 0.01065124, "auxiliary_loss_mlp": 0.01013828, "balance_loss_clip": 1.02373707, "balance_loss_mlp": 1.01257038, "epoch": 0.4446582095833584, "flos": 70487370115200.0, "grad_norm": 0.8475610869453014, "language_loss": 0.60530293, "learning_rate": 2.4517656856459536e-06, "loss": 0.62609243, "num_input_tokens_seen": 79678020, "step": 3698, "time_per_iteration": 3.1985974311828613 }, { "auxiliary_loss_clip": 0.01163865, "auxiliary_loss_mlp": 0.01032096, "balance_loss_clip": 1.04893899, "balance_loss_mlp": 1.02441883, "epoch": 0.4447784524739975, "flos": 26505199313280.0, "grad_norm": 1.6781375798734215, "language_loss": 0.67935461, "learning_rate": 2.4510068126827073e-06, "loss": 0.70131421, "num_input_tokens_seen": 79699020, "step": 3699, "time_per_iteration": 2.5809292793273926 }, { "auxiliary_loss_clip": 0.01150847, "auxiliary_loss_mlp": 0.01027039, "balance_loss_clip": 1.04884779, "balance_loss_mlp": 1.01921844, "epoch": 0.44489869536463655, "flos": 11656209553920.0, "grad_norm": 2.1862589120238622, "language_loss": 0.81894016, "learning_rate": 2.450247871302086e-06, "loss": 0.84071898, "num_input_tokens_seen": 79716795, "step": 3700, "time_per_iteration": 3.329430103302002 }, { "auxiliary_loss_clip": 0.01168277, "auxiliary_loss_mlp": 0.01025239, "balance_loss_clip": 1.05094039, "balance_loss_mlp": 1.0178417, "epoch": 0.44501893825527566, "flos": 20448469958400.0, "grad_norm": 2.2351595138707494, "language_loss": 0.83607012, "learning_rate": 2.44948886161922e-06, "loss": 0.85800529, "num_input_tokens_seen": 79735810, "step": 3701, "time_per_iteration": 2.49894642829895 }, { "auxiliary_loss_clip": 0.01170972, "auxiliary_loss_mlp": 0.01035231, "balance_loss_clip": 1.05405581, "balance_loss_mlp": 1.02781367, "epoch": 0.4451391811459148, "flos": 18261079430400.0, "grad_norm": 1.5503074821111649, "language_loss": 0.85090256, "learning_rate": 2.4487297837492524e-06, "loss": 0.87296462, "num_input_tokens_seen": 79754975, "step": 3702, "time_per_iteration": 2.525719404220581 }, { "auxiliary_loss_clip": 0.01138484, "auxiliary_loss_mlp": 0.01027797, "balance_loss_clip": 1.04967093, "balance_loss_mlp": 1.01979828, "epoch": 0.44525942403655383, "flos": 16910155895040.0, "grad_norm": 2.240812010575333, "language_loss": 0.62692833, "learning_rate": 2.4479706378073323e-06, "loss": 0.64859116, "num_input_tokens_seen": 79773515, "step": 3703, "time_per_iteration": 2.569617509841919 }, { "auxiliary_loss_clip": 0.01126677, "auxiliary_loss_mlp": 0.01032116, "balance_loss_clip": 1.04432261, "balance_loss_mlp": 1.02454054, "epoch": 0.44537966692719294, "flos": 23258838994560.0, "grad_norm": 1.5422658458408698, "language_loss": 0.83756363, "learning_rate": 2.447211423908623e-06, "loss": 0.8591516, "num_input_tokens_seen": 79793560, "step": 3704, "time_per_iteration": 2.636363983154297 }, { "auxiliary_loss_clip": 0.01169902, "auxiliary_loss_mlp": 0.01029388, "balance_loss_clip": 1.05280948, "balance_loss_mlp": 1.02209878, "epoch": 0.445499909817832, "flos": 21724160457600.0, "grad_norm": 2.019599411597836, "language_loss": 0.74949366, "learning_rate": 2.4464521421682966e-06, "loss": 0.77148658, "num_input_tokens_seen": 79811150, "step": 3705, "time_per_iteration": 4.126534938812256 }, { "auxiliary_loss_clip": 0.01162982, "auxiliary_loss_mlp": 0.01024951, "balance_loss_clip": 1.0525552, "balance_loss_mlp": 1.01797771, "epoch": 0.4456201527084711, "flos": 23987969170560.0, "grad_norm": 1.346617224504672, "language_loss": 0.87628913, "learning_rate": 2.4456927927015345e-06, "loss": 0.89816856, "num_input_tokens_seen": 79832190, "step": 3706, "time_per_iteration": 2.545527696609497 }, { "auxiliary_loss_clip": 0.01156033, "auxiliary_loss_mlp": 0.01029488, "balance_loss_clip": 1.05177712, "balance_loss_mlp": 1.02086902, "epoch": 0.4457403955991102, "flos": 18807065136000.0, "grad_norm": 2.1806278859943244, "language_loss": 0.76488209, "learning_rate": 2.4449333756235307e-06, "loss": 0.78673732, "num_input_tokens_seen": 79848905, "step": 3707, "time_per_iteration": 2.5758256912231445 }, { "auxiliary_loss_clip": 0.01168022, "auxiliary_loss_mlp": 0.01031765, "balance_loss_clip": 1.05269551, "balance_loss_mlp": 1.02389121, "epoch": 0.4458606384897493, "flos": 19207756327680.0, "grad_norm": 2.16023427299854, "language_loss": 0.7876147, "learning_rate": 2.4441738910494876e-06, "loss": 0.80961257, "num_input_tokens_seen": 79863640, "step": 3708, "time_per_iteration": 2.5344762802124023 }, { "auxiliary_loss_clip": 0.01155958, "auxiliary_loss_mlp": 0.01034429, "balance_loss_clip": 1.04800391, "balance_loss_mlp": 1.02628708, "epoch": 0.4459808813803884, "flos": 21361283308800.0, "grad_norm": 2.146947507334015, "language_loss": 0.81888247, "learning_rate": 2.4434143390946176e-06, "loss": 0.84078634, "num_input_tokens_seen": 79882450, "step": 3709, "time_per_iteration": 2.57564115524292 }, { "auxiliary_loss_clip": 0.01135503, "auxiliary_loss_mlp": 0.01027938, "balance_loss_clip": 1.04769921, "balance_loss_mlp": 1.02077651, "epoch": 0.4461011242710275, "flos": 23288967527040.0, "grad_norm": 1.9294741624848095, "language_loss": 0.85486013, "learning_rate": 2.4426547198741457e-06, "loss": 0.87649453, "num_input_tokens_seen": 79900655, "step": 3710, "time_per_iteration": 2.658766031265259 }, { "auxiliary_loss_clip": 0.0112571, "auxiliary_loss_mlp": 0.01026283, "balance_loss_clip": 1.05090213, "balance_loss_mlp": 1.01846337, "epoch": 0.44622136716166655, "flos": 20193001453440.0, "grad_norm": 1.9216181891323745, "language_loss": 0.74265778, "learning_rate": 2.441895033503305e-06, "loss": 0.76417768, "num_input_tokens_seen": 79918575, "step": 3711, "time_per_iteration": 2.60860013961792 }, { "auxiliary_loss_clip": 0.0116624, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.0521102, "balance_loss_mlp": 1.02501178, "epoch": 0.44634161005230566, "flos": 21283033530240.0, "grad_norm": 1.6854196319795625, "language_loss": 0.82279009, "learning_rate": 2.4411352800973375e-06, "loss": 0.84478712, "num_input_tokens_seen": 79937010, "step": 3712, "time_per_iteration": 2.5374560356140137 }, { "auxiliary_loss_clip": 0.01132739, "auxiliary_loss_mlp": 0.0102818, "balance_loss_clip": 1.04544508, "balance_loss_mlp": 1.02036572, "epoch": 0.44646185294294477, "flos": 22929358515840.0, "grad_norm": 2.5776990667171757, "language_loss": 0.75019848, "learning_rate": 2.4403754597715005e-06, "loss": 0.77180767, "num_input_tokens_seen": 79956455, "step": 3713, "time_per_iteration": 2.585812568664551 }, { "auxiliary_loss_clip": 0.01155317, "auxiliary_loss_mlp": 0.01026999, "balance_loss_clip": 1.04809725, "balance_loss_mlp": 1.01838601, "epoch": 0.4465820958335838, "flos": 22637692080000.0, "grad_norm": 2.2632990812346754, "language_loss": 0.93091381, "learning_rate": 2.4396155726410553e-06, "loss": 0.95273697, "num_input_tokens_seen": 79975065, "step": 3714, "time_per_iteration": 2.568756580352783 }, { "auxiliary_loss_clip": 0.01167334, "auxiliary_loss_mlp": 0.01028005, "balance_loss_clip": 1.0504899, "balance_loss_mlp": 1.02005935, "epoch": 0.44670233872422294, "flos": 22672525294080.0, "grad_norm": 2.2560874931145354, "language_loss": 0.90351045, "learning_rate": 2.438855618821278e-06, "loss": 0.92546391, "num_input_tokens_seen": 79990865, "step": 3715, "time_per_iteration": 2.5048890113830566 }, { "auxiliary_loss_clip": 0.01157022, "auxiliary_loss_mlp": 0.01028958, "balance_loss_clip": 1.04685879, "balance_loss_mlp": 1.02126908, "epoch": 0.44682258161486205, "flos": 23582178247680.0, "grad_norm": 1.6061326143039418, "language_loss": 0.67200565, "learning_rate": 2.4380955984274517e-06, "loss": 0.69386542, "num_input_tokens_seen": 80009520, "step": 3716, "time_per_iteration": 2.536433219909668 }, { "auxiliary_loss_clip": 0.01166814, "auxiliary_loss_mlp": 0.01030726, "balance_loss_clip": 1.05048037, "balance_loss_mlp": 1.02276921, "epoch": 0.4469428245055011, "flos": 26501356558080.0, "grad_norm": 1.9347400156395684, "language_loss": 0.76858413, "learning_rate": 2.4373355115748716e-06, "loss": 0.79055953, "num_input_tokens_seen": 80030350, "step": 3717, "time_per_iteration": 2.548346519470215 }, { "auxiliary_loss_clip": 0.01144682, "auxiliary_loss_mlp": 0.01035463, "balance_loss_clip": 1.04814243, "balance_loss_mlp": 1.02710068, "epoch": 0.4470630673961402, "flos": 21504925797120.0, "grad_norm": 1.7058735975554502, "language_loss": 0.72164947, "learning_rate": 2.436575358378842e-06, "loss": 0.74345094, "num_input_tokens_seen": 80049840, "step": 3718, "time_per_iteration": 2.5607166290283203 }, { "auxiliary_loss_clip": 0.01156465, "auxiliary_loss_mlp": 0.01037657, "balance_loss_clip": 1.04975951, "balance_loss_mlp": 1.02968812, "epoch": 0.44718331028677927, "flos": 16173986653440.0, "grad_norm": 10.492457842103649, "language_loss": 0.82636642, "learning_rate": 2.4358151389546782e-06, "loss": 0.84830761, "num_input_tokens_seen": 80066525, "step": 3719, "time_per_iteration": 2.5202276706695557 }, { "auxiliary_loss_clip": 0.01182927, "auxiliary_loss_mlp": 0.0102687, "balance_loss_clip": 1.05542302, "balance_loss_mlp": 1.01860905, "epoch": 0.4473035531774184, "flos": 19681238430720.0, "grad_norm": 2.106207601132298, "language_loss": 0.75686955, "learning_rate": 2.4350548534177035e-06, "loss": 0.77896756, "num_input_tokens_seen": 80083355, "step": 3720, "time_per_iteration": 2.4662587642669678 }, { "auxiliary_loss_clip": 0.01136352, "auxiliary_loss_mlp": 0.010301, "balance_loss_clip": 1.04614305, "balance_loss_mlp": 1.02228558, "epoch": 0.4474237960680575, "flos": 41427590515200.0, "grad_norm": 1.5456486069540167, "language_loss": 0.66515476, "learning_rate": 2.434294501883254e-06, "loss": 0.6868192, "num_input_tokens_seen": 80106450, "step": 3721, "time_per_iteration": 2.738900899887085 }, { "auxiliary_loss_clip": 0.01143144, "auxiliary_loss_mlp": 0.01029777, "balance_loss_clip": 1.04691172, "balance_loss_mlp": 1.02187932, "epoch": 0.44754403895869654, "flos": 22891328991360.0, "grad_norm": 1.7129312001303638, "language_loss": 0.65572894, "learning_rate": 2.433534084466674e-06, "loss": 0.67745811, "num_input_tokens_seen": 80125670, "step": 3722, "time_per_iteration": 2.568183183670044 }, { "auxiliary_loss_clip": 0.01176784, "auxiliary_loss_mlp": 0.01026838, "balance_loss_clip": 1.05175769, "balance_loss_mlp": 1.01896477, "epoch": 0.44766428184933565, "flos": 25630271832960.0, "grad_norm": 1.5230970619356274, "language_loss": 0.70447552, "learning_rate": 2.4327736012833178e-06, "loss": 0.72651172, "num_input_tokens_seen": 80147390, "step": 3723, "time_per_iteration": 2.5052080154418945 }, { "auxiliary_loss_clip": 0.01165577, "auxiliary_loss_mlp": 0.01028823, "balance_loss_clip": 1.05193019, "balance_loss_mlp": 1.0208478, "epoch": 0.44778452473997477, "flos": 20448972748800.0, "grad_norm": 2.064807714715758, "language_loss": 0.76358575, "learning_rate": 2.4320130524485506e-06, "loss": 0.78552973, "num_input_tokens_seen": 80166185, "step": 3724, "time_per_iteration": 2.529613733291626 }, { "auxiliary_loss_clip": 0.01146105, "auxiliary_loss_mlp": 0.01027992, "balance_loss_clip": 1.05029845, "balance_loss_mlp": 1.02044642, "epoch": 0.4479047676306138, "flos": 21975462984960.0, "grad_norm": 1.6070388774057396, "language_loss": 0.79664052, "learning_rate": 2.431252438077746e-06, "loss": 0.81838155, "num_input_tokens_seen": 80185685, "step": 3725, "time_per_iteration": 2.545245885848999 }, { "auxiliary_loss_clip": 0.01167602, "auxiliary_loss_mlp": 0.00762025, "balance_loss_clip": 1.04962873, "balance_loss_mlp": 1.00020874, "epoch": 0.44802501052125293, "flos": 21467219495040.0, "grad_norm": 4.197033547283291, "language_loss": 0.77635336, "learning_rate": 2.4304917582862906e-06, "loss": 0.79564959, "num_input_tokens_seen": 80204865, "step": 3726, "time_per_iteration": 3.357140302658081 }, { "auxiliary_loss_clip": 0.01183866, "auxiliary_loss_mlp": 0.01028634, "balance_loss_clip": 1.05628777, "balance_loss_mlp": 1.02091503, "epoch": 0.44814525341189204, "flos": 22126970551680.0, "grad_norm": 2.116973676661863, "language_loss": 0.87876272, "learning_rate": 2.4297310131895774e-06, "loss": 0.90088773, "num_input_tokens_seen": 80223410, "step": 3727, "time_per_iteration": 2.4591686725616455 }, { "auxiliary_loss_clip": 0.01168485, "auxiliary_loss_mlp": 0.01032906, "balance_loss_clip": 1.05278504, "balance_loss_mlp": 1.02462125, "epoch": 0.4482654963025311, "flos": 16653933204480.0, "grad_norm": 3.7153219381935196, "language_loss": 0.74190724, "learning_rate": 2.4289702029030113e-06, "loss": 0.76392108, "num_input_tokens_seen": 80240880, "step": 3728, "time_per_iteration": 2.494149923324585 }, { "auxiliary_loss_clip": 0.01168125, "auxiliary_loss_mlp": 0.01028537, "balance_loss_clip": 1.05263352, "balance_loss_mlp": 1.02049685, "epoch": 0.4483857391931702, "flos": 18841251905280.0, "grad_norm": 1.8776428544068313, "language_loss": 0.82943904, "learning_rate": 2.4282093275420057e-06, "loss": 0.85140562, "num_input_tokens_seen": 80259910, "step": 3729, "time_per_iteration": 2.5029714107513428 }, { "auxiliary_loss_clip": 0.01169381, "auxiliary_loss_mlp": 0.01034668, "balance_loss_clip": 1.05236161, "balance_loss_mlp": 1.02644897, "epoch": 0.4485059820838093, "flos": 20372590477440.0, "grad_norm": 2.0276213550221995, "language_loss": 0.70438468, "learning_rate": 2.4274483872219863e-06, "loss": 0.72642517, "num_input_tokens_seen": 80277270, "step": 3730, "time_per_iteration": 2.5254156589508057 }, { "auxiliary_loss_clip": 0.01163041, "auxiliary_loss_mlp": 0.01023881, "balance_loss_clip": 1.04945159, "balance_loss_mlp": 1.01678848, "epoch": 0.4486262249744484, "flos": 20047742853120.0, "grad_norm": 1.7256791028489342, "language_loss": 0.93629432, "learning_rate": 2.426687382058386e-06, "loss": 0.9581635, "num_input_tokens_seen": 80295550, "step": 3731, "time_per_iteration": 3.340275287628174 }, { "auxiliary_loss_clip": 0.01063666, "auxiliary_loss_mlp": 0.01004931, "balance_loss_clip": 1.0219686, "balance_loss_mlp": 1.00370932, "epoch": 0.4487464678650875, "flos": 64595684776320.0, "grad_norm": 0.8619712904131849, "language_loss": 0.59844339, "learning_rate": 2.425926312166649e-06, "loss": 0.6191293, "num_input_tokens_seen": 80348425, "step": 3732, "time_per_iteration": 3.679382562637329 }, { "auxiliary_loss_clip": 0.01156503, "auxiliary_loss_mlp": 0.01037421, "balance_loss_clip": 1.05011034, "balance_loss_mlp": 1.02886236, "epoch": 0.4488667107557266, "flos": 20769798049920.0, "grad_norm": 6.192275022314623, "language_loss": 0.73192149, "learning_rate": 2.42516517766223e-06, "loss": 0.75386071, "num_input_tokens_seen": 80366505, "step": 3733, "time_per_iteration": 2.5511629581451416 }, { "auxiliary_loss_clip": 0.01182641, "auxiliary_loss_mlp": 0.01027671, "balance_loss_clip": 1.05537701, "balance_loss_mlp": 1.01968443, "epoch": 0.44898695364636565, "flos": 23951735326080.0, "grad_norm": 2.16847034998956, "language_loss": 0.67932951, "learning_rate": 2.4244039786605907e-06, "loss": 0.70143265, "num_input_tokens_seen": 80387510, "step": 3734, "time_per_iteration": 2.530648946762085 }, { "auxiliary_loss_clip": 0.01125893, "auxiliary_loss_mlp": 0.01033397, "balance_loss_clip": 1.04650545, "balance_loss_mlp": 1.02508807, "epoch": 0.44910719653700476, "flos": 18624351628800.0, "grad_norm": 2.3013358157231583, "language_loss": 0.82571024, "learning_rate": 2.4236427152772055e-06, "loss": 0.84730315, "num_input_tokens_seen": 80405915, "step": 3735, "time_per_iteration": 2.5852949619293213 }, { "auxiliary_loss_clip": 0.010351, "auxiliary_loss_mlp": 0.01000981, "balance_loss_clip": 1.01919901, "balance_loss_mlp": 0.99984246, "epoch": 0.4492274394276438, "flos": 57033435749760.0, "grad_norm": 0.8217609648861407, "language_loss": 0.57374841, "learning_rate": 2.422881387627557e-06, "loss": 0.59410918, "num_input_tokens_seen": 80458365, "step": 3736, "time_per_iteration": 2.890310287475586 }, { "auxiliary_loss_clip": 0.01154044, "auxiliary_loss_mlp": 0.01026595, "balance_loss_clip": 1.05151665, "balance_loss_mlp": 1.01907563, "epoch": 0.4493476823182829, "flos": 23254888498560.0, "grad_norm": 1.660004501633974, "language_loss": 0.77376556, "learning_rate": 2.422119995827139e-06, "loss": 0.79557192, "num_input_tokens_seen": 80478490, "step": 3737, "time_per_iteration": 2.579089879989624 }, { "auxiliary_loss_clip": 0.01170235, "auxiliary_loss_mlp": 0.01030949, "balance_loss_clip": 1.05420017, "balance_loss_mlp": 1.02272701, "epoch": 0.44946792520892204, "flos": 15815131827840.0, "grad_norm": 2.3751748706337916, "language_loss": 0.74255991, "learning_rate": 2.4213585399914528e-06, "loss": 0.76457167, "num_input_tokens_seen": 80495695, "step": 3738, "time_per_iteration": 2.475811243057251 }, { "auxiliary_loss_clip": 0.01167074, "auxiliary_loss_mlp": 0.01024049, "balance_loss_clip": 1.05301154, "balance_loss_mlp": 1.01623487, "epoch": 0.4495881680995611, "flos": 19610063631360.0, "grad_norm": 1.7110419307668605, "language_loss": 0.85274756, "learning_rate": 2.4205970202360113e-06, "loss": 0.87465882, "num_input_tokens_seen": 80515260, "step": 3739, "time_per_iteration": 2.5008251667022705 }, { "auxiliary_loss_clip": 0.01116952, "auxiliary_loss_mlp": 0.01028766, "balance_loss_clip": 1.04703939, "balance_loss_mlp": 1.02002251, "epoch": 0.4497084109902002, "flos": 26031465815040.0, "grad_norm": 2.1830800775741723, "language_loss": 0.78032738, "learning_rate": 2.4198354366763354e-06, "loss": 0.80178457, "num_input_tokens_seen": 80533900, "step": 3740, "time_per_iteration": 2.63045072555542 }, { "auxiliary_loss_clip": 0.01154883, "auxiliary_loss_mlp": 0.010307, "balance_loss_clip": 1.04992461, "balance_loss_mlp": 1.02314186, "epoch": 0.4498286538808393, "flos": 14793688771200.0, "grad_norm": 1.9496122034549181, "language_loss": 0.78668541, "learning_rate": 2.4190737894279587e-06, "loss": 0.8085413, "num_input_tokens_seen": 80551270, "step": 3741, "time_per_iteration": 2.542106866836548 }, { "auxiliary_loss_clip": 0.01123846, "auxiliary_loss_mlp": 0.01028158, "balance_loss_clip": 1.04154646, "balance_loss_mlp": 1.01980138, "epoch": 0.44994889677147837, "flos": 15450171690240.0, "grad_norm": 2.3577224979803315, "language_loss": 0.80423629, "learning_rate": 2.4183120786064203e-06, "loss": 0.82575637, "num_input_tokens_seen": 80568145, "step": 3742, "time_per_iteration": 2.729404926300049 }, { "auxiliary_loss_clip": 0.01168431, "auxiliary_loss_mlp": 0.00761747, "balance_loss_clip": 1.05390978, "balance_loss_mlp": 1.00021648, "epoch": 0.4500691396621175, "flos": 21798316085760.0, "grad_norm": 2.316118362882173, "language_loss": 0.85375422, "learning_rate": 2.417550304327273e-06, "loss": 0.87305599, "num_input_tokens_seen": 80586185, "step": 3743, "time_per_iteration": 2.5212302207946777 }, { "auxiliary_loss_clip": 0.01184228, "auxiliary_loss_mlp": 0.01031505, "balance_loss_clip": 1.05560875, "balance_loss_mlp": 1.02297592, "epoch": 0.4501893825527566, "flos": 32382016421760.0, "grad_norm": 1.5746668950151628, "language_loss": 0.75656879, "learning_rate": 2.4167884667060763e-06, "loss": 0.7787261, "num_input_tokens_seen": 80608895, "step": 3744, "time_per_iteration": 2.5996124744415283 }, { "auxiliary_loss_clip": 0.0115192, "auxiliary_loss_mlp": 0.01039593, "balance_loss_clip": 1.04934537, "balance_loss_mlp": 1.03102231, "epoch": 0.45030962544339564, "flos": 16544944362240.0, "grad_norm": 2.6947670647959097, "language_loss": 0.87218684, "learning_rate": 2.4160265658584e-06, "loss": 0.89410204, "num_input_tokens_seen": 80623785, "step": 3745, "time_per_iteration": 2.51450514793396 }, { "auxiliary_loss_clip": 0.01169818, "auxiliary_loss_mlp": 0.01029341, "balance_loss_clip": 1.05292797, "balance_loss_mlp": 1.02104378, "epoch": 0.45042986833403476, "flos": 19573039687680.0, "grad_norm": 1.9568865787293412, "language_loss": 0.68479067, "learning_rate": 2.4152646018998253e-06, "loss": 0.70678228, "num_input_tokens_seen": 80642735, "step": 3746, "time_per_iteration": 2.5059092044830322 }, { "auxiliary_loss_clip": 0.01164538, "auxiliary_loss_mlp": 0.01026531, "balance_loss_clip": 1.05192351, "balance_loss_mlp": 1.01849663, "epoch": 0.45055011122467387, "flos": 23112467072640.0, "grad_norm": 1.671373828783129, "language_loss": 0.71769565, "learning_rate": 2.4145025749459403e-06, "loss": 0.73960632, "num_input_tokens_seen": 80663760, "step": 3747, "time_per_iteration": 2.542825698852539 }, { "auxiliary_loss_clip": 0.01095388, "auxiliary_loss_mlp": 0.0103046, "balance_loss_clip": 1.04750562, "balance_loss_mlp": 1.02216291, "epoch": 0.4506703541153129, "flos": 19934623946880.0, "grad_norm": 2.115509316035939, "language_loss": 0.69861931, "learning_rate": 2.413740485112344e-06, "loss": 0.71987778, "num_input_tokens_seen": 80682100, "step": 3748, "time_per_iteration": 2.7453818321228027 }, { "auxiliary_loss_clip": 0.0114622, "auxiliary_loss_mlp": 0.01030783, "balance_loss_clip": 1.05070257, "balance_loss_mlp": 1.0232724, "epoch": 0.45079059700595203, "flos": 19499530504320.0, "grad_norm": 1.557093654060581, "language_loss": 0.82065713, "learning_rate": 2.412978332514646e-06, "loss": 0.84242713, "num_input_tokens_seen": 80700880, "step": 3749, "time_per_iteration": 2.690563201904297 }, { "auxiliary_loss_clip": 0.01152998, "auxiliary_loss_mlp": 0.01032242, "balance_loss_clip": 1.0515883, "balance_loss_mlp": 1.02405834, "epoch": 0.4509108398965911, "flos": 27636313570560.0, "grad_norm": 2.028793880825944, "language_loss": 0.72269398, "learning_rate": 2.4122161172684623e-06, "loss": 0.74454635, "num_input_tokens_seen": 80721675, "step": 3750, "time_per_iteration": 2.635821580886841 }, { "auxiliary_loss_clip": 0.01154444, "auxiliary_loss_mlp": 0.01026486, "balance_loss_clip": 1.05321586, "balance_loss_mlp": 1.01790893, "epoch": 0.4510310827872302, "flos": 20995712640000.0, "grad_norm": 2.0244630287820558, "language_loss": 0.83929867, "learning_rate": 2.4114538394894216e-06, "loss": 0.86110795, "num_input_tokens_seen": 80739315, "step": 3751, "time_per_iteration": 2.5359854698181152 }, { "auxiliary_loss_clip": 0.01148524, "auxiliary_loss_mlp": 0.01023775, "balance_loss_clip": 1.04592478, "balance_loss_mlp": 1.01585388, "epoch": 0.4511513256778693, "flos": 16216684945920.0, "grad_norm": 1.772785382547845, "language_loss": 0.82975066, "learning_rate": 2.410691499293161e-06, "loss": 0.85147357, "num_input_tokens_seen": 80757470, "step": 3752, "time_per_iteration": 3.3461711406707764 }, { "auxiliary_loss_clip": 0.01165835, "auxiliary_loss_mlp": 0.01030081, "balance_loss_clip": 1.05040002, "balance_loss_mlp": 1.02220154, "epoch": 0.45127156856850836, "flos": 25186702780800.0, "grad_norm": 1.8424297677156591, "language_loss": 0.7385062, "learning_rate": 2.409929096795326e-06, "loss": 0.76046538, "num_input_tokens_seen": 80777840, "step": 3753, "time_per_iteration": 2.537637948989868 }, { "auxiliary_loss_clip": 0.01165229, "auxiliary_loss_mlp": 0.01032001, "balance_loss_clip": 1.04989612, "balance_loss_mlp": 1.0237577, "epoch": 0.4513918114591475, "flos": 20412523422720.0, "grad_norm": 1.996365031138014, "language_loss": 0.792328, "learning_rate": 2.409166632111573e-06, "loss": 0.8143003, "num_input_tokens_seen": 80795975, "step": 3754, "time_per_iteration": 2.5109753608703613 }, { "auxiliary_loss_clip": 0.0117402, "auxiliary_loss_mlp": 0.0102961, "balance_loss_clip": 1.0528717, "balance_loss_mlp": 1.0209012, "epoch": 0.4515120543497866, "flos": 26648482665600.0, "grad_norm": 1.921833505831142, "language_loss": 0.80469263, "learning_rate": 2.4084041053575674e-06, "loss": 0.82672888, "num_input_tokens_seen": 80815395, "step": 3755, "time_per_iteration": 2.5416147708892822 }, { "auxiliary_loss_clip": 0.01158654, "auxiliary_loss_mlp": 0.0103094, "balance_loss_clip": 1.05413532, "balance_loss_mlp": 1.0228039, "epoch": 0.45163229724042564, "flos": 20595093275520.0, "grad_norm": 1.977531459274755, "language_loss": 0.72547853, "learning_rate": 2.4076415166489834e-06, "loss": 0.74737448, "num_input_tokens_seen": 80834805, "step": 3756, "time_per_iteration": 2.549978256225586 }, { "auxiliary_loss_clip": 0.01129312, "auxiliary_loss_mlp": 0.01029098, "balance_loss_clip": 1.0475142, "balance_loss_mlp": 1.02183509, "epoch": 0.45175254013106475, "flos": 21689004021120.0, "grad_norm": 1.4973980060039136, "language_loss": 0.78973055, "learning_rate": 2.406878866101506e-06, "loss": 0.8113147, "num_input_tokens_seen": 80853770, "step": 3757, "time_per_iteration": 3.3884408473968506 }, { "auxiliary_loss_clip": 0.01180577, "auxiliary_loss_mlp": 0.0102968, "balance_loss_clip": 1.05456805, "balance_loss_mlp": 1.02171397, "epoch": 0.45187278302170386, "flos": 18878850466560.0, "grad_norm": 2.694086365620156, "language_loss": 0.78530377, "learning_rate": 2.4061161538308273e-06, "loss": 0.80740631, "num_input_tokens_seen": 80870615, "step": 3758, "time_per_iteration": 3.2624094486236572 }, { "auxiliary_loss_clip": 0.01167492, "auxiliary_loss_mlp": 0.01027073, "balance_loss_clip": 1.05339313, "balance_loss_mlp": 1.01934183, "epoch": 0.4519930259123429, "flos": 18582479349120.0, "grad_norm": 1.900172915747642, "language_loss": 0.88665944, "learning_rate": 2.4053533799526523e-06, "loss": 0.9086051, "num_input_tokens_seen": 80886335, "step": 3759, "time_per_iteration": 2.4971325397491455 }, { "auxiliary_loss_clip": 0.01148891, "auxiliary_loss_mlp": 0.0103088, "balance_loss_clip": 1.05199122, "balance_loss_mlp": 1.02358985, "epoch": 0.452113268802982, "flos": 25192377129600.0, "grad_norm": 1.6801494337811582, "language_loss": 0.86253804, "learning_rate": 2.404590544582691e-06, "loss": 0.88433576, "num_input_tokens_seen": 80904570, "step": 3760, "time_per_iteration": 2.5955586433410645 }, { "auxiliary_loss_clip": 0.0112755, "auxiliary_loss_mlp": 0.01032287, "balance_loss_clip": 1.04489613, "balance_loss_mlp": 1.02411509, "epoch": 0.45223351169362114, "flos": 39378922312320.0, "grad_norm": 1.5745157694883132, "language_loss": 0.80606765, "learning_rate": 2.403827647836666e-06, "loss": 0.82766604, "num_input_tokens_seen": 80925125, "step": 3761, "time_per_iteration": 2.7795450687408447 }, { "auxiliary_loss_clip": 0.01180162, "auxiliary_loss_mlp": 0.01025423, "balance_loss_clip": 1.05227709, "balance_loss_mlp": 1.01797581, "epoch": 0.4523537545842602, "flos": 21582169994880.0, "grad_norm": 2.0972195759709322, "language_loss": 0.69855225, "learning_rate": 2.4030646898303075e-06, "loss": 0.72060812, "num_input_tokens_seen": 80946615, "step": 3762, "time_per_iteration": 2.5028369426727295 }, { "auxiliary_loss_clip": 0.01158184, "auxiliary_loss_mlp": 0.01032132, "balance_loss_clip": 1.05061162, "balance_loss_mlp": 1.02431154, "epoch": 0.4524739974748993, "flos": 28439527547520.0, "grad_norm": 2.0851574561029866, "language_loss": 0.82166994, "learning_rate": 2.4023016706793566e-06, "loss": 0.84357309, "num_input_tokens_seen": 80966410, "step": 3763, "time_per_iteration": 2.6181085109710693 }, { "auxiliary_loss_clip": 0.01044042, "auxiliary_loss_mlp": 0.01003417, "balance_loss_clip": 1.01589954, "balance_loss_mlp": 1.00212944, "epoch": 0.4525942403655384, "flos": 61556492148480.0, "grad_norm": 0.7649879260504175, "language_loss": 0.56857371, "learning_rate": 2.401538590499561e-06, "loss": 0.58904827, "num_input_tokens_seen": 81026865, "step": 3764, "time_per_iteration": 3.2094645500183105 }, { "auxiliary_loss_clip": 0.01167742, "auxiliary_loss_mlp": 0.00761519, "balance_loss_clip": 1.0504601, "balance_loss_mlp": 1.00021195, "epoch": 0.45271448325617747, "flos": 27529838680320.0, "grad_norm": 5.202569655840854, "language_loss": 0.71258479, "learning_rate": 2.400775449406682e-06, "loss": 0.73187739, "num_input_tokens_seen": 81050060, "step": 3765, "time_per_iteration": 2.60562801361084 }, { "auxiliary_loss_clip": 0.01163676, "auxiliary_loss_mlp": 0.01031993, "balance_loss_clip": 1.04935014, "balance_loss_mlp": 1.0246613, "epoch": 0.4528347261468166, "flos": 22452608275200.0, "grad_norm": 1.7673253345238815, "language_loss": 0.73109686, "learning_rate": 2.400012247516485e-06, "loss": 0.75305355, "num_input_tokens_seen": 81070625, "step": 3766, "time_per_iteration": 2.5212621688842773 }, { "auxiliary_loss_clip": 0.01139534, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 1.04680789, "balance_loss_mlp": 1.02159715, "epoch": 0.45295496903745563, "flos": 21103875469440.0, "grad_norm": 1.7500070089493907, "language_loss": 0.90265751, "learning_rate": 2.3992489849447484e-06, "loss": 0.92434555, "num_input_tokens_seen": 81089080, "step": 3767, "time_per_iteration": 2.654156446456909 }, { "auxiliary_loss_clip": 0.01141357, "auxiliary_loss_mlp": 0.01030741, "balance_loss_clip": 1.04794121, "balance_loss_mlp": 1.02287936, "epoch": 0.45307521192809475, "flos": 23221168606080.0, "grad_norm": 2.545577789465279, "language_loss": 0.78824234, "learning_rate": 2.3984856618072584e-06, "loss": 0.80996335, "num_input_tokens_seen": 81109115, "step": 3768, "time_per_iteration": 2.6796483993530273 }, { "auxiliary_loss_clip": 0.01142118, "auxiliary_loss_mlp": 0.01028332, "balance_loss_clip": 1.0478586, "balance_loss_mlp": 1.02046978, "epoch": 0.45319545481873386, "flos": 15560094286080.0, "grad_norm": 1.9591170176243695, "language_loss": 0.74333227, "learning_rate": 2.3977222782198098e-06, "loss": 0.76503682, "num_input_tokens_seen": 81127750, "step": 3769, "time_per_iteration": 2.6364450454711914 }, { "auxiliary_loss_clip": 0.01125972, "auxiliary_loss_mlp": 0.01026841, "balance_loss_clip": 1.04405594, "balance_loss_mlp": 1.01839519, "epoch": 0.4533156977093729, "flos": 21944759834880.0, "grad_norm": 1.7229852175809637, "language_loss": 0.75324547, "learning_rate": 2.3969588342982077e-06, "loss": 0.77477366, "num_input_tokens_seen": 81147125, "step": 3770, "time_per_iteration": 2.5933375358581543 }, { "auxiliary_loss_clip": 0.01166781, "auxiliary_loss_mlp": 0.01027363, "balance_loss_clip": 1.05430102, "balance_loss_mlp": 1.01973379, "epoch": 0.453435940600012, "flos": 24242180699520.0, "grad_norm": 2.0172318808423984, "language_loss": 0.72666621, "learning_rate": 2.396195330158267e-06, "loss": 0.74860764, "num_input_tokens_seen": 81167015, "step": 3771, "time_per_iteration": 2.567723274230957 }, { "auxiliary_loss_clip": 0.01180011, "auxiliary_loss_mlp": 0.01028095, "balance_loss_clip": 1.0528878, "balance_loss_mlp": 1.02021551, "epoch": 0.45355618349065113, "flos": 23440367352960.0, "grad_norm": 1.7618341475052894, "language_loss": 0.79252362, "learning_rate": 2.3954317659158094e-06, "loss": 0.8146047, "num_input_tokens_seen": 81187350, "step": 3772, "time_per_iteration": 2.557277202606201 }, { "auxiliary_loss_clip": 0.01070173, "auxiliary_loss_mlp": 0.01001143, "balance_loss_clip": 1.01523507, "balance_loss_mlp": 0.99980748, "epoch": 0.4536764263812902, "flos": 66903161448960.0, "grad_norm": 0.8868259617590143, "language_loss": 0.56933814, "learning_rate": 2.394668141686667e-06, "loss": 0.59005129, "num_input_tokens_seen": 81249315, "step": 3773, "time_per_iteration": 3.12005352973938 }, { "auxiliary_loss_clip": 0.01161226, "auxiliary_loss_mlp": 0.01027976, "balance_loss_clip": 1.04910946, "balance_loss_mlp": 1.01991129, "epoch": 0.4537966692719293, "flos": 42739766254080.0, "grad_norm": 3.5653974189662967, "language_loss": 0.69459838, "learning_rate": 2.3939044575866813e-06, "loss": 0.71649045, "num_input_tokens_seen": 81272065, "step": 3774, "time_per_iteration": 2.715855360031128 }, { "auxiliary_loss_clip": 0.01145807, "auxiliary_loss_mlp": 0.00761299, "balance_loss_clip": 1.04776955, "balance_loss_mlp": 1.00016522, "epoch": 0.4539169121625684, "flos": 35549480517120.0, "grad_norm": 2.0428615785557596, "language_loss": 0.75969523, "learning_rate": 2.3931407137317024e-06, "loss": 0.77876627, "num_input_tokens_seen": 81292220, "step": 3775, "time_per_iteration": 2.6729605197906494 }, { "auxiliary_loss_clip": 0.01130669, "auxiliary_loss_mlp": 0.01030223, "balance_loss_clip": 1.04455352, "balance_loss_mlp": 1.02268887, "epoch": 0.45403715505320746, "flos": 18514716341760.0, "grad_norm": 1.680903914707323, "language_loss": 0.85047662, "learning_rate": 2.3923769102375907e-06, "loss": 0.87208557, "num_input_tokens_seen": 81311085, "step": 3776, "time_per_iteration": 2.5943236351013184 }, { "auxiliary_loss_clip": 0.01132686, "auxiliary_loss_mlp": 0.01029901, "balance_loss_clip": 1.04284859, "balance_loss_mlp": 1.02198529, "epoch": 0.4541573979438466, "flos": 25045825639680.0, "grad_norm": 2.2590623624294137, "language_loss": 0.78322482, "learning_rate": 2.391613047220213e-06, "loss": 0.8048507, "num_input_tokens_seen": 81330985, "step": 3777, "time_per_iteration": 2.6010138988494873 }, { "auxiliary_loss_clip": 0.01127349, "auxiliary_loss_mlp": 0.0102632, "balance_loss_clip": 1.04552877, "balance_loss_mlp": 1.01818955, "epoch": 0.4542776408344857, "flos": 18332397884160.0, "grad_norm": 1.9334093914807406, "language_loss": 0.78960192, "learning_rate": 2.390849124795447e-06, "loss": 0.81113863, "num_input_tokens_seen": 81346985, "step": 3778, "time_per_iteration": 3.4059925079345703 }, { "auxiliary_loss_clip": 0.01179921, "auxiliary_loss_mlp": 0.01033161, "balance_loss_clip": 1.05292284, "balance_loss_mlp": 1.02595472, "epoch": 0.45439788372512474, "flos": 20701173116160.0, "grad_norm": 3.7446396467162018, "language_loss": 0.84076071, "learning_rate": 2.3900851430791804e-06, "loss": 0.8628915, "num_input_tokens_seen": 81365005, "step": 3779, "time_per_iteration": 2.4777472019195557 }, { "auxiliary_loss_clip": 0.01183709, "auxiliary_loss_mlp": 0.01031912, "balance_loss_clip": 1.05449033, "balance_loss_mlp": 1.02393651, "epoch": 0.45451812661576385, "flos": 22309432663680.0, "grad_norm": 1.9315845149299342, "language_loss": 0.84753847, "learning_rate": 2.389321102187307e-06, "loss": 0.86969471, "num_input_tokens_seen": 81383785, "step": 3780, "time_per_iteration": 2.492516040802002 }, { "auxiliary_loss_clip": 0.01150123, "auxiliary_loss_mlp": 0.00762172, "balance_loss_clip": 1.05113888, "balance_loss_mlp": 1.00020003, "epoch": 0.4546383695064029, "flos": 21763303303680.0, "grad_norm": 1.9053929084710592, "language_loss": 0.81403887, "learning_rate": 2.3885570022357326e-06, "loss": 0.83316183, "num_input_tokens_seen": 81402915, "step": 3781, "time_per_iteration": 2.568053960800171 }, { "auxiliary_loss_clip": 0.01040487, "auxiliary_loss_mlp": 0.01000187, "balance_loss_clip": 1.01374555, "balance_loss_mlp": 0.99888211, "epoch": 0.454758612397042, "flos": 64242755694720.0, "grad_norm": 0.7973027278109948, "language_loss": 0.60883373, "learning_rate": 2.38779284334037e-06, "loss": 0.62924051, "num_input_tokens_seen": 81467890, "step": 3782, "time_per_iteration": 3.2188708782196045 }, { "auxiliary_loss_clip": 0.01108907, "auxiliary_loss_mlp": 0.01027455, "balance_loss_clip": 1.04292166, "balance_loss_mlp": 1.01944959, "epoch": 0.4548788552876811, "flos": 27304175485440.0, "grad_norm": 2.0087927195568245, "language_loss": 0.78783935, "learning_rate": 2.387028625617141e-06, "loss": 0.80920303, "num_input_tokens_seen": 81487105, "step": 3783, "time_per_iteration": 4.199683904647827 }, { "auxiliary_loss_clip": 0.01136009, "auxiliary_loss_mlp": 0.0102937, "balance_loss_clip": 1.0439204, "balance_loss_mlp": 1.02131116, "epoch": 0.4549990981783202, "flos": 22857142222080.0, "grad_norm": 1.9232732299776745, "language_loss": 0.84856898, "learning_rate": 2.3862643491819766e-06, "loss": 0.87022281, "num_input_tokens_seen": 81505670, "step": 3784, "time_per_iteration": 3.353611469268799 }, { "auxiliary_loss_clip": 0.01161471, "auxiliary_loss_mlp": 0.01031439, "balance_loss_clip": 1.04868889, "balance_loss_mlp": 1.02332664, "epoch": 0.4551193410689593, "flos": 23258587599360.0, "grad_norm": 1.7986853295778558, "language_loss": 0.84642726, "learning_rate": 2.3855000141508186e-06, "loss": 0.86835635, "num_input_tokens_seen": 81525825, "step": 3785, "time_per_iteration": 2.517411470413208 }, { "auxiliary_loss_clip": 0.01153024, "auxiliary_loss_mlp": 0.01031393, "balance_loss_clip": 1.05185723, "balance_loss_mlp": 1.02368021, "epoch": 0.4552395839595984, "flos": 20777519473920.0, "grad_norm": 2.1638363965872585, "language_loss": 0.84160626, "learning_rate": 2.3847356206396143e-06, "loss": 0.86345047, "num_input_tokens_seen": 81543135, "step": 3786, "time_per_iteration": 2.5667545795440674 }, { "auxiliary_loss_clip": 0.011788, "auxiliary_loss_mlp": 0.01026503, "balance_loss_clip": 1.05261099, "balance_loss_mlp": 1.0182476, "epoch": 0.45535982685023746, "flos": 23257510191360.0, "grad_norm": 1.8993395746155657, "language_loss": 0.785941, "learning_rate": 2.3839711687643227e-06, "loss": 0.80799401, "num_input_tokens_seen": 81564360, "step": 3787, "time_per_iteration": 2.493812084197998 }, { "auxiliary_loss_clip": 0.01164581, "auxiliary_loss_mlp": 0.01036055, "balance_loss_clip": 1.05120993, "balance_loss_mlp": 1.02697146, "epoch": 0.45548006974087657, "flos": 19646117907840.0, "grad_norm": 1.8965805983731625, "language_loss": 0.74043846, "learning_rate": 2.38320665864091e-06, "loss": 0.76244485, "num_input_tokens_seen": 81583710, "step": 3788, "time_per_iteration": 2.5088119506835938 }, { "auxiliary_loss_clip": 0.01109202, "auxiliary_loss_mlp": 0.01026415, "balance_loss_clip": 1.042243, "balance_loss_mlp": 1.01835692, "epoch": 0.4556003126315157, "flos": 20047778766720.0, "grad_norm": 2.18090340115957, "language_loss": 0.81917691, "learning_rate": 2.3824420903853516e-06, "loss": 0.84053314, "num_input_tokens_seen": 81602175, "step": 3789, "time_per_iteration": 2.645024061203003 }, { "auxiliary_loss_clip": 0.01163526, "auxiliary_loss_mlp": 0.01028044, "balance_loss_clip": 1.05225778, "balance_loss_mlp": 1.02034342, "epoch": 0.45572055552215474, "flos": 22959738443520.0, "grad_norm": 2.4949820159595957, "language_loss": 0.81972516, "learning_rate": 2.3816774641136324e-06, "loss": 0.84164083, "num_input_tokens_seen": 81619430, "step": 3790, "time_per_iteration": 2.5227949619293213 }, { "auxiliary_loss_clip": 0.01164165, "auxiliary_loss_mlp": 0.00762096, "balance_loss_clip": 1.05221486, "balance_loss_mlp": 1.00019598, "epoch": 0.45584079841279385, "flos": 33109925535360.0, "grad_norm": 1.801857214666825, "language_loss": 0.71666884, "learning_rate": 2.380912779941745e-06, "loss": 0.73593146, "num_input_tokens_seen": 81642550, "step": 3791, "time_per_iteration": 2.6076061725616455 }, { "auxiliary_loss_clip": 0.0116482, "auxiliary_loss_mlp": 0.01032322, "balance_loss_clip": 1.0475359, "balance_loss_mlp": 1.02392364, "epoch": 0.45596104130343296, "flos": 27272179445760.0, "grad_norm": 2.3027762903335742, "language_loss": 0.83261931, "learning_rate": 2.3801480379856918e-06, "loss": 0.85459077, "num_input_tokens_seen": 81664260, "step": 3792, "time_per_iteration": 2.561879873275757 }, { "auxiliary_loss_clip": 0.01153728, "auxiliary_loss_mlp": 0.01029146, "balance_loss_clip": 1.05192685, "balance_loss_mlp": 1.02167797, "epoch": 0.456081284194072, "flos": 21579799697280.0, "grad_norm": 1.6934736593844248, "language_loss": 0.8375355, "learning_rate": 2.379383238361484e-06, "loss": 0.85936421, "num_input_tokens_seen": 81683620, "step": 3793, "time_per_iteration": 2.5379748344421387 }, { "auxiliary_loss_clip": 0.01162525, "auxiliary_loss_mlp": 0.01030195, "balance_loss_clip": 1.04995668, "balance_loss_mlp": 1.02285802, "epoch": 0.4562015270847111, "flos": 35918822113920.0, "grad_norm": 2.0553376732504582, "language_loss": 0.79870224, "learning_rate": 2.3786183811851407e-06, "loss": 0.82062936, "num_input_tokens_seen": 81704325, "step": 3794, "time_per_iteration": 2.6312150955200195 }, { "auxiliary_loss_clip": 0.01180709, "auxiliary_loss_mlp": 0.01036963, "balance_loss_clip": 1.05374098, "balance_loss_mlp": 1.02926183, "epoch": 0.45632176997535023, "flos": 13589783602560.0, "grad_norm": 1.5802467703076075, "language_loss": 0.80032527, "learning_rate": 2.3778534665726892e-06, "loss": 0.82250202, "num_input_tokens_seen": 81721155, "step": 3795, "time_per_iteration": 2.450556993484497 }, { "auxiliary_loss_clip": 0.01155335, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.04955709, "balance_loss_mlp": 1.02194107, "epoch": 0.4564420128659893, "flos": 32635401937920.0, "grad_norm": 1.8152062035181948, "language_loss": 0.72958767, "learning_rate": 2.377088494640168e-06, "loss": 0.75143832, "num_input_tokens_seen": 81742905, "step": 3796, "time_per_iteration": 2.5993428230285645 }, { "auxiliary_loss_clip": 0.01161029, "auxiliary_loss_mlp": 0.01026251, "balance_loss_clip": 1.05149388, "balance_loss_mlp": 1.01860404, "epoch": 0.4565622557566284, "flos": 20377690208640.0, "grad_norm": 1.792088329194526, "language_loss": 0.78275025, "learning_rate": 2.3763234655036216e-06, "loss": 0.80462307, "num_input_tokens_seen": 81762105, "step": 3797, "time_per_iteration": 2.4977407455444336 }, { "auxiliary_loss_clip": 0.01135705, "auxiliary_loss_mlp": 0.01035473, "balance_loss_clip": 1.04448485, "balance_loss_mlp": 1.02767086, "epoch": 0.45668249864726745, "flos": 25374372364800.0, "grad_norm": 2.1049223235504786, "language_loss": 0.8714186, "learning_rate": 2.3755583792791046e-06, "loss": 0.89313036, "num_input_tokens_seen": 81781975, "step": 3798, "time_per_iteration": 2.6019344329833984 }, { "auxiliary_loss_clip": 0.01167196, "auxiliary_loss_mlp": 0.01035724, "balance_loss_clip": 1.05087304, "balance_loss_mlp": 1.02815986, "epoch": 0.45680274153790656, "flos": 15559806977280.0, "grad_norm": 1.8613305574932022, "language_loss": 0.74481493, "learning_rate": 2.3747932360826803e-06, "loss": 0.76684415, "num_input_tokens_seen": 81798905, "step": 3799, "time_per_iteration": 2.472931146621704 }, { "auxiliary_loss_clip": 0.01165934, "auxiliary_loss_mlp": 0.01024436, "balance_loss_clip": 1.05413246, "balance_loss_mlp": 1.01612139, "epoch": 0.4569229844285457, "flos": 19792884879360.0, "grad_norm": 2.1837763210766377, "language_loss": 0.82427132, "learning_rate": 2.3740280360304205e-06, "loss": 0.84617507, "num_input_tokens_seen": 81816630, "step": 3800, "time_per_iteration": 2.4984776973724365 }, { "auxiliary_loss_clip": 0.01139938, "auxiliary_loss_mlp": 0.01032198, "balance_loss_clip": 1.0512948, "balance_loss_mlp": 1.02480721, "epoch": 0.45704322731918473, "flos": 24093941270400.0, "grad_norm": 1.7945169905518796, "language_loss": 0.68115139, "learning_rate": 2.3732627792384038e-06, "loss": 0.70287275, "num_input_tokens_seen": 81837700, "step": 3801, "time_per_iteration": 2.592421770095825 }, { "auxiliary_loss_clip": 0.01177858, "auxiliary_loss_mlp": 0.01023501, "balance_loss_clip": 1.05098438, "balance_loss_mlp": 1.01615763, "epoch": 0.45716347020982384, "flos": 31317803245440.0, "grad_norm": 1.8708389401223504, "language_loss": 0.75378042, "learning_rate": 2.3724974658227207e-06, "loss": 0.77579403, "num_input_tokens_seen": 81858490, "step": 3802, "time_per_iteration": 2.544110059738159 }, { "auxiliary_loss_clip": 0.01150097, "auxiliary_loss_mlp": 0.00762382, "balance_loss_clip": 1.05009663, "balance_loss_mlp": 1.00019372, "epoch": 0.45728371310046295, "flos": 26501392471680.0, "grad_norm": 1.8379132774020817, "language_loss": 0.71100938, "learning_rate": 2.3717320958994687e-06, "loss": 0.73013413, "num_input_tokens_seen": 81876050, "step": 3803, "time_per_iteration": 2.571352005004883 }, { "auxiliary_loss_clip": 0.01136345, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.04312027, "balance_loss_mlp": 1.02887511, "epoch": 0.457403955991102, "flos": 17929408222080.0, "grad_norm": 1.8578063434708478, "language_loss": 0.70411956, "learning_rate": 2.3709666695847534e-06, "loss": 0.72585273, "num_input_tokens_seen": 81894230, "step": 3804, "time_per_iteration": 3.422168731689453 }, { "auxiliary_loss_clip": 0.01116562, "auxiliary_loss_mlp": 0.0102792, "balance_loss_clip": 1.04452276, "balance_loss_mlp": 1.02060986, "epoch": 0.4575241988817411, "flos": 42230660837760.0, "grad_norm": 1.5551688135112223, "language_loss": 0.70166409, "learning_rate": 2.370201186994689e-06, "loss": 0.72310889, "num_input_tokens_seen": 81917915, "step": 3805, "time_per_iteration": 2.8034238815307617 }, { "auxiliary_loss_clip": 0.01142309, "auxiliary_loss_mlp": 0.01028631, "balance_loss_clip": 1.0493536, "balance_loss_mlp": 1.02066207, "epoch": 0.45764444177238023, "flos": 30117309868800.0, "grad_norm": 1.9353637129858212, "language_loss": 0.69575, "learning_rate": 2.369435648245399e-06, "loss": 0.71745944, "num_input_tokens_seen": 81938130, "step": 3806, "time_per_iteration": 2.6137192249298096 }, { "auxiliary_loss_clip": 0.01149112, "auxiliary_loss_mlp": 0.01031643, "balance_loss_clip": 1.04974532, "balance_loss_mlp": 1.02370358, "epoch": 0.4577646846630193, "flos": 24060293205120.0, "grad_norm": 1.636798247198128, "language_loss": 0.85214365, "learning_rate": 2.368670053453015e-06, "loss": 0.8739512, "num_input_tokens_seen": 81959820, "step": 3807, "time_per_iteration": 2.563408851623535 }, { "auxiliary_loss_clip": 0.01170345, "auxiliary_loss_mlp": 0.01027881, "balance_loss_clip": 1.05289364, "balance_loss_mlp": 1.02008426, "epoch": 0.4578849275536584, "flos": 17418578952960.0, "grad_norm": 2.0851831387036346, "language_loss": 0.74079561, "learning_rate": 2.3679044027336757e-06, "loss": 0.76277786, "num_input_tokens_seen": 81975710, "step": 3808, "time_per_iteration": 2.463207483291626 }, { "auxiliary_loss_clip": 0.0117976, "auxiliary_loss_mlp": 0.01029225, "balance_loss_clip": 1.05269849, "balance_loss_mlp": 1.02129769, "epoch": 0.4580051704442975, "flos": 13510169107200.0, "grad_norm": 2.6388820831048654, "language_loss": 0.69075012, "learning_rate": 2.3671386962035326e-06, "loss": 0.71283996, "num_input_tokens_seen": 81993180, "step": 3809, "time_per_iteration": 3.9721713066101074 }, { "auxiliary_loss_clip": 0.01165379, "auxiliary_loss_mlp": 0.01032264, "balance_loss_clip": 1.0510819, "balance_loss_mlp": 1.02402639, "epoch": 0.45812541333493656, "flos": 18037606965120.0, "grad_norm": 2.0458670841621833, "language_loss": 0.68644786, "learning_rate": 2.3663729339787405e-06, "loss": 0.70842433, "num_input_tokens_seen": 82010115, "step": 3810, "time_per_iteration": 3.324791669845581 }, { "auxiliary_loss_clip": 0.01177861, "auxiliary_loss_mlp": 0.01033448, "balance_loss_clip": 1.05096436, "balance_loss_mlp": 1.02426934, "epoch": 0.45824565622557567, "flos": 20222196232320.0, "grad_norm": 2.713546743892018, "language_loss": 0.73684931, "learning_rate": 2.365607116175466e-06, "loss": 0.75896239, "num_input_tokens_seen": 82025540, "step": 3811, "time_per_iteration": 2.4307312965393066 }, { "auxiliary_loss_clip": 0.01179175, "auxiliary_loss_mlp": 0.01026513, "balance_loss_clip": 1.05358887, "balance_loss_mlp": 1.01863337, "epoch": 0.4583658991162148, "flos": 19864885691520.0, "grad_norm": 2.1698646906564405, "language_loss": 0.66494989, "learning_rate": 2.3648412429098825e-06, "loss": 0.68700677, "num_input_tokens_seen": 82043890, "step": 3812, "time_per_iteration": 2.4761080741882324 }, { "auxiliary_loss_clip": 0.01135933, "auxiliary_loss_mlp": 0.01040697, "balance_loss_clip": 1.05037785, "balance_loss_mlp": 1.03250158, "epoch": 0.45848614200685384, "flos": 21029935322880.0, "grad_norm": 1.949084677617244, "language_loss": 0.81928158, "learning_rate": 2.364075314298172e-06, "loss": 0.84104788, "num_input_tokens_seen": 82061345, "step": 3813, "time_per_iteration": 2.575054407119751 }, { "auxiliary_loss_clip": 0.01166659, "auxiliary_loss_mlp": 0.00761607, "balance_loss_clip": 1.05042815, "balance_loss_mlp": 1.00015831, "epoch": 0.45860638489749295, "flos": 21069293650560.0, "grad_norm": 1.8160267284931182, "language_loss": 0.70252156, "learning_rate": 2.3633093304565267e-06, "loss": 0.72180426, "num_input_tokens_seen": 82080400, "step": 3814, "time_per_iteration": 2.5105133056640625 }, { "auxiliary_loss_clip": 0.0118707, "auxiliary_loss_mlp": 0.01029752, "balance_loss_clip": 1.05795026, "balance_loss_mlp": 1.02178264, "epoch": 0.458726627788132, "flos": 26833889692800.0, "grad_norm": 1.8442737339484487, "language_loss": 0.63193285, "learning_rate": 2.3625432915011443e-06, "loss": 0.65410107, "num_input_tokens_seen": 82102310, "step": 3815, "time_per_iteration": 2.5119123458862305 }, { "auxiliary_loss_clip": 0.01144357, "auxiliary_loss_mlp": 0.0102406, "balance_loss_clip": 1.04796159, "balance_loss_mlp": 1.01630497, "epoch": 0.4588468706787711, "flos": 24097927680000.0, "grad_norm": 1.6874672628438991, "language_loss": 0.6479466, "learning_rate": 2.3617771975482334e-06, "loss": 0.66963077, "num_input_tokens_seen": 82121140, "step": 3816, "time_per_iteration": 2.5616648197174072 }, { "auxiliary_loss_clip": 0.01120326, "auxiliary_loss_mlp": 0.01029604, "balance_loss_clip": 1.0471251, "balance_loss_mlp": 1.02266622, "epoch": 0.4589671135694102, "flos": 17889331622400.0, "grad_norm": 1.5948200773387111, "language_loss": 0.7441653, "learning_rate": 2.3610110487140083e-06, "loss": 0.76566458, "num_input_tokens_seen": 82139575, "step": 3817, "time_per_iteration": 2.5911097526550293 }, { "auxiliary_loss_clip": 0.01148852, "auxiliary_loss_mlp": 0.01026335, "balance_loss_clip": 1.04917026, "balance_loss_mlp": 1.01888466, "epoch": 0.4590873564600493, "flos": 25626967781760.0, "grad_norm": 1.8560785538958022, "language_loss": 0.80775893, "learning_rate": 2.360244845114695e-06, "loss": 0.82951081, "num_input_tokens_seen": 82159195, "step": 3818, "time_per_iteration": 2.590060234069824 }, { "auxiliary_loss_clip": 0.01145822, "auxiliary_loss_mlp": 0.01026081, "balance_loss_clip": 1.05029237, "balance_loss_mlp": 1.01785541, "epoch": 0.4592075993506884, "flos": 18514788168960.0, "grad_norm": 2.7665287631213835, "language_loss": 0.68278855, "learning_rate": 2.3594785868665245e-06, "loss": 0.70450759, "num_input_tokens_seen": 82175500, "step": 3819, "time_per_iteration": 2.5120010375976562 }, { "auxiliary_loss_clip": 0.01138766, "auxiliary_loss_mlp": 0.00762228, "balance_loss_clip": 1.04785728, "balance_loss_mlp": 1.00021803, "epoch": 0.4593278422413275, "flos": 20631111638400.0, "grad_norm": 2.0117359522772884, "language_loss": 0.80365419, "learning_rate": 2.3587122740857386e-06, "loss": 0.82266414, "num_input_tokens_seen": 82192600, "step": 3820, "time_per_iteration": 2.579968214035034 }, { "auxiliary_loss_clip": 0.01166148, "auxiliary_loss_mlp": 0.0102856, "balance_loss_clip": 1.05194831, "balance_loss_mlp": 1.02125573, "epoch": 0.45944808513196655, "flos": 21358517961600.0, "grad_norm": 1.667297621637011, "language_loss": 0.7783882, "learning_rate": 2.357945906888586e-06, "loss": 0.80033535, "num_input_tokens_seen": 82212040, "step": 3821, "time_per_iteration": 2.5351154804229736 }, { "auxiliary_loss_clip": 0.01167951, "auxiliary_loss_mlp": 0.01028102, "balance_loss_clip": 1.05287886, "balance_loss_mlp": 1.0196085, "epoch": 0.45956832802260567, "flos": 21427789340160.0, "grad_norm": 2.1802845660241017, "language_loss": 0.79489684, "learning_rate": 2.357179485391324e-06, "loss": 0.81685734, "num_input_tokens_seen": 82229895, "step": 3822, "time_per_iteration": 2.5007171630859375 }, { "auxiliary_loss_clip": 0.01178089, "auxiliary_loss_mlp": 0.01035217, "balance_loss_clip": 1.05351627, "balance_loss_mlp": 1.02780795, "epoch": 0.4596885709132448, "flos": 22382654538240.0, "grad_norm": 1.8434412700223115, "language_loss": 0.86401498, "learning_rate": 2.3564130097102173e-06, "loss": 0.8861481, "num_input_tokens_seen": 82249550, "step": 3823, "time_per_iteration": 2.481332302093506 }, { "auxiliary_loss_clip": 0.01144941, "auxiliary_loss_mlp": 0.01027494, "balance_loss_clip": 1.05209458, "balance_loss_mlp": 1.01951897, "epoch": 0.45980881380388383, "flos": 28981957806720.0, "grad_norm": 4.074730774137933, "language_loss": 0.74850452, "learning_rate": 2.355646479961541e-06, "loss": 0.7702288, "num_input_tokens_seen": 82268860, "step": 3824, "time_per_iteration": 2.612250328063965 }, { "auxiliary_loss_clip": 0.01178564, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.05194402, "balance_loss_mlp": 1.02534223, "epoch": 0.45992905669452294, "flos": 33396599980800.0, "grad_norm": 1.878286627401379, "language_loss": 0.7144261, "learning_rate": 2.354879896261576e-06, "loss": 0.73654485, "num_input_tokens_seen": 82289070, "step": 3825, "time_per_iteration": 2.5806679725646973 }, { "auxiliary_loss_clip": 0.01135159, "auxiliary_loss_mlp": 0.01028494, "balance_loss_clip": 1.05120504, "balance_loss_mlp": 1.02030456, "epoch": 0.46004929958516205, "flos": 36318184502400.0, "grad_norm": 5.522962680050381, "language_loss": 0.56786686, "learning_rate": 2.3541132587266133e-06, "loss": 0.58950341, "num_input_tokens_seen": 82311790, "step": 3826, "time_per_iteration": 2.697087049484253 }, { "auxiliary_loss_clip": 0.01142246, "auxiliary_loss_mlp": 0.01034288, "balance_loss_clip": 1.04981399, "balance_loss_mlp": 1.02683115, "epoch": 0.4601695424758011, "flos": 17238451224960.0, "grad_norm": 1.7571933025394342, "language_loss": 0.69240391, "learning_rate": 2.3533465674729515e-06, "loss": 0.71416926, "num_input_tokens_seen": 82329020, "step": 3827, "time_per_iteration": 2.564023733139038 }, { "auxiliary_loss_clip": 0.01182661, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.05570102, "balance_loss_mlp": 1.02357578, "epoch": 0.4602897853664402, "flos": 15888425529600.0, "grad_norm": 1.9792846762446406, "language_loss": 0.72898108, "learning_rate": 2.352579822616895e-06, "loss": 0.75112683, "num_input_tokens_seen": 82346455, "step": 3828, "time_per_iteration": 2.4639105796813965 }, { "auxiliary_loss_clip": 0.01153657, "auxiliary_loss_mlp": 0.0102472, "balance_loss_clip": 1.05189013, "balance_loss_mlp": 1.01701283, "epoch": 0.4604100282570793, "flos": 25412617370880.0, "grad_norm": 1.9379010216362937, "language_loss": 0.77922392, "learning_rate": 2.351813024274761e-06, "loss": 0.80100763, "num_input_tokens_seen": 82367810, "step": 3829, "time_per_iteration": 2.583096742630005 }, { "auxiliary_loss_clip": 0.01140975, "auxiliary_loss_mlp": 0.0103261, "balance_loss_clip": 1.04955423, "balance_loss_mlp": 1.02446246, "epoch": 0.4605302711477184, "flos": 27630711048960.0, "grad_norm": 1.7621040674671256, "language_loss": 0.73219037, "learning_rate": 2.3510461725628693e-06, "loss": 0.75392628, "num_input_tokens_seen": 82388275, "step": 3830, "time_per_iteration": 3.399188280105591 }, { "auxiliary_loss_clip": 0.0114183, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.05097222, "balance_loss_mlp": 1.02465415, "epoch": 0.4606505140383575, "flos": 23839657914240.0, "grad_norm": 1.9365507423421742, "language_loss": 0.70671248, "learning_rate": 2.350279267597554e-06, "loss": 0.72845608, "num_input_tokens_seen": 82408915, "step": 3831, "time_per_iteration": 2.6219053268432617 }, { "auxiliary_loss_clip": 0.01166094, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.05276155, "balance_loss_mlp": 1.02593708, "epoch": 0.46077075692899655, "flos": 16107013745280.0, "grad_norm": 1.943939318866429, "language_loss": 0.82441229, "learning_rate": 2.3495123094951515e-06, "loss": 0.84641576, "num_input_tokens_seen": 82427260, "step": 3832, "time_per_iteration": 2.494882822036743 }, { "auxiliary_loss_clip": 0.01146752, "auxiliary_loss_mlp": 0.01034446, "balance_loss_clip": 1.05080533, "balance_loss_mlp": 1.02626824, "epoch": 0.46089099981963566, "flos": 48798147634560.0, "grad_norm": 4.90343666593536, "language_loss": 0.75708663, "learning_rate": 2.34874529837201e-06, "loss": 0.7788986, "num_input_tokens_seen": 82450805, "step": 3833, "time_per_iteration": 2.812138795852661 }, { "auxiliary_loss_clip": 0.01104341, "auxiliary_loss_mlp": 0.01027854, "balance_loss_clip": 1.04446983, "balance_loss_mlp": 1.02024853, "epoch": 0.46101124271027477, "flos": 19099234362240.0, "grad_norm": 1.9892262990228236, "language_loss": 0.79303271, "learning_rate": 2.347978234344483e-06, "loss": 0.81435472, "num_input_tokens_seen": 82467010, "step": 3834, "time_per_iteration": 2.6123430728912354 }, { "auxiliary_loss_clip": 0.01171124, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 1.05386639, "balance_loss_mlp": 1.02892077, "epoch": 0.4611314856009138, "flos": 39347931853440.0, "grad_norm": 1.6233061100839423, "language_loss": 0.68726754, "learning_rate": 2.347211117528935e-06, "loss": 0.70936263, "num_input_tokens_seen": 82489310, "step": 3835, "time_per_iteration": 3.4536683559417725 }, { "auxiliary_loss_clip": 0.0114966, "auxiliary_loss_mlp": 0.01031977, "balance_loss_clip": 1.05498195, "balance_loss_mlp": 1.02394795, "epoch": 0.46125172849155294, "flos": 20810772489600.0, "grad_norm": 1.5283580941011288, "language_loss": 0.71693909, "learning_rate": 2.3464439480417374e-06, "loss": 0.73875546, "num_input_tokens_seen": 82508830, "step": 3836, "time_per_iteration": 3.4179673194885254 }, { "auxiliary_loss_clip": 0.0117152, "auxiliary_loss_mlp": 0.01024943, "balance_loss_clip": 1.05468369, "balance_loss_mlp": 1.01671147, "epoch": 0.46137197138219205, "flos": 17930808852480.0, "grad_norm": 2.8021381161504975, "language_loss": 0.77459174, "learning_rate": 2.3456767259992676e-06, "loss": 0.79655635, "num_input_tokens_seen": 82526475, "step": 3837, "time_per_iteration": 2.4996180534362793 }, { "auxiliary_loss_clip": 0.01181886, "auxiliary_loss_mlp": 0.00761881, "balance_loss_clip": 1.05393863, "balance_loss_mlp": 1.00018942, "epoch": 0.4614922142728311, "flos": 16836610798080.0, "grad_norm": 2.1802875904668464, "language_loss": 0.88705122, "learning_rate": 2.3449094515179135e-06, "loss": 0.9064889, "num_input_tokens_seen": 82543935, "step": 3838, "time_per_iteration": 2.446885585784912 }, { "auxiliary_loss_clip": 0.01155503, "auxiliary_loss_mlp": 0.01032577, "balance_loss_clip": 1.04916298, "balance_loss_mlp": 1.02518606, "epoch": 0.4616124571634702, "flos": 26614906427520.0, "grad_norm": 1.6630711569477297, "language_loss": 0.81721914, "learning_rate": 2.34414212471407e-06, "loss": 0.83909988, "num_input_tokens_seen": 82563730, "step": 3839, "time_per_iteration": 2.577338695526123 }, { "auxiliary_loss_clip": 0.01172094, "auxiliary_loss_mlp": 0.0102852, "balance_loss_clip": 1.05262852, "balance_loss_mlp": 1.02053905, "epoch": 0.4617327000541093, "flos": 20340127560960.0, "grad_norm": 2.0464323185763496, "language_loss": 0.72511089, "learning_rate": 2.3433747457041394e-06, "loss": 0.74711698, "num_input_tokens_seen": 82582435, "step": 3840, "time_per_iteration": 2.4908883571624756 }, { "auxiliary_loss_clip": 0.01136729, "auxiliary_loss_mlp": 0.01031925, "balance_loss_clip": 1.05061829, "balance_loss_mlp": 1.02337766, "epoch": 0.4618529429447484, "flos": 29570749545600.0, "grad_norm": 1.8362775405161145, "language_loss": 0.8485496, "learning_rate": 2.342607314604533e-06, "loss": 0.87023616, "num_input_tokens_seen": 82602185, "step": 3841, "time_per_iteration": 2.6505284309387207 }, { "auxiliary_loss_clip": 0.01164147, "auxiliary_loss_mlp": 0.01037053, "balance_loss_clip": 1.05355704, "balance_loss_mlp": 1.02855885, "epoch": 0.4619731858353875, "flos": 19787030962560.0, "grad_norm": 2.019988810713038, "language_loss": 0.83939052, "learning_rate": 2.3418398315316694e-06, "loss": 0.86140245, "num_input_tokens_seen": 82620005, "step": 3842, "time_per_iteration": 2.5004162788391113 }, { "auxiliary_loss_clip": 0.01180829, "auxiliary_loss_mlp": 0.01031336, "balance_loss_clip": 1.05473757, "balance_loss_mlp": 1.02297378, "epoch": 0.4620934287260266, "flos": 18951138587520.0, "grad_norm": 2.2762700472927446, "language_loss": 0.78476191, "learning_rate": 2.3410722966019755e-06, "loss": 0.80688357, "num_input_tokens_seen": 82635120, "step": 3843, "time_per_iteration": 2.461857557296753 }, { "auxiliary_loss_clip": 0.0116631, "auxiliary_loss_mlp": 0.01030328, "balance_loss_clip": 1.05304205, "balance_loss_mlp": 1.02240109, "epoch": 0.46221367161666566, "flos": 37341674634240.0, "grad_norm": 1.6584498640583603, "language_loss": 0.65746927, "learning_rate": 2.3403047099318848e-06, "loss": 0.67943561, "num_input_tokens_seen": 82659190, "step": 3844, "time_per_iteration": 2.643996000289917 }, { "auxiliary_loss_clip": 0.01118465, "auxiliary_loss_mlp": 0.01030928, "balance_loss_clip": 1.04554474, "balance_loss_mlp": 1.0228399, "epoch": 0.46233391450730477, "flos": 14428549065600.0, "grad_norm": 2.1344106447397198, "language_loss": 0.75169057, "learning_rate": 2.3395370716378405e-06, "loss": 0.77318454, "num_input_tokens_seen": 82676635, "step": 3845, "time_per_iteration": 2.60945987701416 }, { "auxiliary_loss_clip": 0.01166518, "auxiliary_loss_mlp": 0.01029116, "balance_loss_clip": 1.05021822, "balance_loss_mlp": 1.02145708, "epoch": 0.4624541573979438, "flos": 22493044010880.0, "grad_norm": 2.0350382128080815, "language_loss": 0.71876955, "learning_rate": 2.338769381836292e-06, "loss": 0.74072593, "num_input_tokens_seen": 82696245, "step": 3846, "time_per_iteration": 2.5322506427764893 }, { "auxiliary_loss_clip": 0.01136035, "auxiliary_loss_mlp": 0.0103327, "balance_loss_clip": 1.05113339, "balance_loss_mlp": 1.02528572, "epoch": 0.46257440028858293, "flos": 14465070218880.0, "grad_norm": 1.9842742541243525, "language_loss": 0.73507619, "learning_rate": 2.3380016406436984e-06, "loss": 0.75676924, "num_input_tokens_seen": 82713725, "step": 3847, "time_per_iteration": 2.5452163219451904 }, { "auxiliary_loss_clip": 0.01122415, "auxiliary_loss_mlp": 0.01027074, "balance_loss_clip": 1.05018234, "balance_loss_mlp": 1.01945007, "epoch": 0.46269464317922204, "flos": 23332204523520.0, "grad_norm": 1.854488492573768, "language_loss": 0.81254613, "learning_rate": 2.337233848176524e-06, "loss": 0.83404106, "num_input_tokens_seen": 82731495, "step": 3848, "time_per_iteration": 2.6263160705566406 }, { "auxiliary_loss_clip": 0.01114858, "auxiliary_loss_mlp": 0.01029851, "balance_loss_clip": 1.04478276, "balance_loss_mlp": 1.02151215, "epoch": 0.4628148860698611, "flos": 18552027594240.0, "grad_norm": 3.887915044141574, "language_loss": 0.83574039, "learning_rate": 2.3364660045512435e-06, "loss": 0.85718751, "num_input_tokens_seen": 82750255, "step": 3849, "time_per_iteration": 2.5974502563476562 }, { "auxiliary_loss_clip": 0.01047175, "auxiliary_loss_mlp": 0.01003698, "balance_loss_clip": 1.01627493, "balance_loss_mlp": 1.00236845, "epoch": 0.4629351289605002, "flos": 70667569670400.0, "grad_norm": 0.735644602631247, "language_loss": 0.58212018, "learning_rate": 2.335698109884337e-06, "loss": 0.60262895, "num_input_tokens_seen": 82815460, "step": 3850, "time_per_iteration": 3.263976573944092 }, { "auxiliary_loss_clip": 0.01028483, "auxiliary_loss_mlp": 0.01002967, "balance_loss_clip": 1.01904535, "balance_loss_mlp": 1.00165558, "epoch": 0.4630553718511393, "flos": 59687200465920.0, "grad_norm": 0.7835813656838914, "language_loss": 0.59870625, "learning_rate": 2.334930164292294e-06, "loss": 0.61902076, "num_input_tokens_seen": 82878010, "step": 3851, "time_per_iteration": 3.2982001304626465 }, { "auxiliary_loss_clip": 0.01120005, "auxiliary_loss_mlp": 0.01028392, "balance_loss_clip": 1.04660332, "balance_loss_mlp": 1.02034557, "epoch": 0.4631756147417784, "flos": 15960605909760.0, "grad_norm": 2.6162478676432075, "language_loss": 0.80263382, "learning_rate": 2.334162167891612e-06, "loss": 0.82411778, "num_input_tokens_seen": 82895275, "step": 3852, "time_per_iteration": 2.5861377716064453 }, { "auxiliary_loss_clip": 0.01154361, "auxiliary_loss_mlp": 0.01027958, "balance_loss_clip": 1.04895639, "balance_loss_mlp": 1.01970232, "epoch": 0.4632958576324175, "flos": 16472907636480.0, "grad_norm": 2.020150749778654, "language_loss": 0.7513212, "learning_rate": 2.333394120798795e-06, "loss": 0.77314442, "num_input_tokens_seen": 82914010, "step": 3853, "time_per_iteration": 2.533970832824707 }, { "auxiliary_loss_clip": 0.01154735, "auxiliary_loss_mlp": 0.01030445, "balance_loss_clip": 1.04984856, "balance_loss_mlp": 1.02265453, "epoch": 0.4634161005230566, "flos": 22346492520960.0, "grad_norm": 2.9327089702818276, "language_loss": 0.72339869, "learning_rate": 2.3326260231303545e-06, "loss": 0.74525052, "num_input_tokens_seen": 82932610, "step": 3854, "time_per_iteration": 2.5625243186950684 }, { "auxiliary_loss_clip": 0.01181348, "auxiliary_loss_mlp": 0.01029292, "balance_loss_clip": 1.05629075, "balance_loss_mlp": 1.02116251, "epoch": 0.46353634341369565, "flos": 15742233175680.0, "grad_norm": 1.6820933465103396, "language_loss": 0.86562353, "learning_rate": 2.331857875002811e-06, "loss": 0.88772988, "num_input_tokens_seen": 82951210, "step": 3855, "time_per_iteration": 3.26835298538208 }, { "auxiliary_loss_clip": 0.01156371, "auxiliary_loss_mlp": 0.01037177, "balance_loss_clip": 1.05407727, "balance_loss_mlp": 1.02972639, "epoch": 0.46365658630433476, "flos": 28329820433280.0, "grad_norm": 1.6209306286721221, "language_loss": 0.76182193, "learning_rate": 2.3310896765326916e-06, "loss": 0.78375739, "num_input_tokens_seen": 82972210, "step": 3856, "time_per_iteration": 2.6177046298980713 }, { "auxiliary_loss_clip": 0.01132306, "auxiliary_loss_mlp": 0.01032283, "balance_loss_clip": 1.04655886, "balance_loss_mlp": 1.02448034, "epoch": 0.46377682919497387, "flos": 24608074590720.0, "grad_norm": 1.7482383253248859, "language_loss": 0.83774626, "learning_rate": 2.330321427836531e-06, "loss": 0.85939217, "num_input_tokens_seen": 82994080, "step": 3857, "time_per_iteration": 2.668135643005371 }, { "auxiliary_loss_clip": 0.01165068, "auxiliary_loss_mlp": 0.01026937, "balance_loss_clip": 1.0520407, "balance_loss_mlp": 1.01901591, "epoch": 0.4638970720856129, "flos": 19060953442560.0, "grad_norm": 1.7030701615808672, "language_loss": 0.82673222, "learning_rate": 2.3295531290308733e-06, "loss": 0.8486523, "num_input_tokens_seen": 83012230, "step": 3858, "time_per_iteration": 2.5214805603027344 }, { "auxiliary_loss_clip": 0.01184065, "auxiliary_loss_mlp": 0.00762034, "balance_loss_clip": 1.05560243, "balance_loss_mlp": 1.00020409, "epoch": 0.46401731497625204, "flos": 18471012468480.0, "grad_norm": 3.061230269254775, "language_loss": 0.7583459, "learning_rate": 2.3287847802322678e-06, "loss": 0.77780688, "num_input_tokens_seen": 83027800, "step": 3859, "time_per_iteration": 2.4616923332214355 }, { "auxiliary_loss_clip": 0.01156663, "auxiliary_loss_mlp": 0.01032126, "balance_loss_clip": 1.05344009, "balance_loss_mlp": 1.02259493, "epoch": 0.4641375578668911, "flos": 26067053214720.0, "grad_norm": 1.6102642622424008, "language_loss": 0.84160447, "learning_rate": 2.3280163815572723e-06, "loss": 0.86349237, "num_input_tokens_seen": 83048395, "step": 3860, "time_per_iteration": 2.575424909591675 }, { "auxiliary_loss_clip": 0.01145241, "auxiliary_loss_mlp": 0.01035803, "balance_loss_clip": 1.05088365, "balance_loss_mlp": 1.02759552, "epoch": 0.4642578007575302, "flos": 19570382081280.0, "grad_norm": 1.9479080923588787, "language_loss": 0.77277839, "learning_rate": 2.3272479331224522e-06, "loss": 0.7945888, "num_input_tokens_seen": 83065825, "step": 3861, "time_per_iteration": 4.074430227279663 }, { "auxiliary_loss_clip": 0.01184241, "auxiliary_loss_mlp": 0.0102881, "balance_loss_clip": 1.0551939, "balance_loss_mlp": 1.02019763, "epoch": 0.4643780436481693, "flos": 28186249772160.0, "grad_norm": 1.5040459854793349, "language_loss": 0.77775913, "learning_rate": 2.3264794350443817e-06, "loss": 0.79988968, "num_input_tokens_seen": 83087920, "step": 3862, "time_per_iteration": 3.3948593139648438 }, { "auxiliary_loss_clip": 0.01166774, "auxiliary_loss_mlp": 0.0102899, "balance_loss_clip": 1.04932857, "balance_loss_mlp": 1.02095485, "epoch": 0.46449828653880837, "flos": 25375270204800.0, "grad_norm": 1.8656347976096819, "language_loss": 0.78355986, "learning_rate": 2.3257108874396396e-06, "loss": 0.80551744, "num_input_tokens_seen": 83109015, "step": 3863, "time_per_iteration": 2.559690237045288 }, { "auxiliary_loss_clip": 0.01153099, "auxiliary_loss_mlp": 0.01030471, "balance_loss_clip": 1.0503372, "balance_loss_mlp": 1.02258539, "epoch": 0.4646185294294475, "flos": 16034330574720.0, "grad_norm": 1.9078937896235535, "language_loss": 0.73988312, "learning_rate": 2.3249422904248152e-06, "loss": 0.76171887, "num_input_tokens_seen": 83127450, "step": 3864, "time_per_iteration": 2.539729356765747 }, { "auxiliary_loss_clip": 0.01170326, "auxiliary_loss_mlp": 0.01031704, "balance_loss_clip": 1.05176425, "balance_loss_mlp": 1.02340722, "epoch": 0.4647387723200866, "flos": 26363101109760.0, "grad_norm": 1.396668412005556, "language_loss": 0.87376034, "learning_rate": 2.324173644116504e-06, "loss": 0.89578062, "num_input_tokens_seen": 83150300, "step": 3865, "time_per_iteration": 2.576395273208618 }, { "auxiliary_loss_clip": 0.01166155, "auxiliary_loss_mlp": 0.01032706, "balance_loss_clip": 1.05370855, "balance_loss_mlp": 1.02443874, "epoch": 0.46485901521072565, "flos": 27160209774720.0, "grad_norm": 1.6024242067384387, "language_loss": 0.8130765, "learning_rate": 2.3234049486313087e-06, "loss": 0.83506507, "num_input_tokens_seen": 83171750, "step": 3866, "time_per_iteration": 2.5759479999542236 }, { "auxiliary_loss_clip": 0.01169408, "auxiliary_loss_mlp": 0.01030055, "balance_loss_clip": 1.05458808, "balance_loss_mlp": 1.02262187, "epoch": 0.46497925810136476, "flos": 24279851088000.0, "grad_norm": 1.6703055656197032, "language_loss": 0.75951242, "learning_rate": 2.322636204085839e-06, "loss": 0.78150707, "num_input_tokens_seen": 83191820, "step": 3867, "time_per_iteration": 2.6413400173187256 }, { "auxiliary_loss_clip": 0.01145257, "auxiliary_loss_mlp": 0.01033367, "balance_loss_clip": 1.04784262, "balance_loss_mlp": 1.02516568, "epoch": 0.46509950099200387, "flos": 16253134272000.0, "grad_norm": 2.4902576683788293, "language_loss": 0.78843534, "learning_rate": 2.3218674105967143e-06, "loss": 0.81022155, "num_input_tokens_seen": 83210085, "step": 3868, "time_per_iteration": 2.5719077587127686 }, { "auxiliary_loss_clip": 0.01145195, "auxiliary_loss_mlp": 0.01027638, "balance_loss_clip": 1.04978371, "balance_loss_mlp": 1.01969254, "epoch": 0.4652197438826429, "flos": 23442270773760.0, "grad_norm": 1.571348074885005, "language_loss": 0.83407432, "learning_rate": 2.3210985682805593e-06, "loss": 0.85580266, "num_input_tokens_seen": 83231865, "step": 3869, "time_per_iteration": 2.604243516921997 }, { "auxiliary_loss_clip": 0.01180655, "auxiliary_loss_mlp": 0.01032471, "balance_loss_clip": 1.05503464, "balance_loss_mlp": 1.02449846, "epoch": 0.46533998677328203, "flos": 16216397637120.0, "grad_norm": 2.288223247883593, "language_loss": 0.67969233, "learning_rate": 2.320329677254007e-06, "loss": 0.70182353, "num_input_tokens_seen": 83249195, "step": 3870, "time_per_iteration": 2.44983172416687 }, { "auxiliary_loss_clip": 0.01182752, "auxiliary_loss_mlp": 0.01028868, "balance_loss_clip": 1.05563772, "balance_loss_mlp": 1.02067196, "epoch": 0.46546022966392114, "flos": 21141869080320.0, "grad_norm": 2.373526451628856, "language_loss": 0.72720945, "learning_rate": 2.319560737633697e-06, "loss": 0.74932563, "num_input_tokens_seen": 83267915, "step": 3871, "time_per_iteration": 2.4718477725982666 }, { "auxiliary_loss_clip": 0.01142748, "auxiliary_loss_mlp": 0.01034066, "balance_loss_clip": 1.04659867, "balance_loss_mlp": 1.02585256, "epoch": 0.4655804725545602, "flos": 41171942442240.0, "grad_norm": 1.8333824291975287, "language_loss": 0.67830509, "learning_rate": 2.3187917495362775e-06, "loss": 0.70007324, "num_input_tokens_seen": 83292325, "step": 3872, "time_per_iteration": 2.7449910640716553 }, { "auxiliary_loss_clip": 0.01123195, "auxiliary_loss_mlp": 0.01038279, "balance_loss_clip": 1.04899788, "balance_loss_mlp": 1.0303638, "epoch": 0.4657007154451993, "flos": 19570956698880.0, "grad_norm": 2.718375067912322, "language_loss": 0.76879859, "learning_rate": 2.318022713078403e-06, "loss": 0.79041338, "num_input_tokens_seen": 83306905, "step": 3873, "time_per_iteration": 2.5777699947357178 }, { "auxiliary_loss_clip": 0.01149723, "auxiliary_loss_mlp": 0.01029197, "balance_loss_clip": 1.04953873, "balance_loss_mlp": 1.02126372, "epoch": 0.4658209583358384, "flos": 15517826956800.0, "grad_norm": 2.2555585599478487, "language_loss": 0.85335803, "learning_rate": 2.3172536283767354e-06, "loss": 0.87514722, "num_input_tokens_seen": 83320665, "step": 3874, "time_per_iteration": 2.503894329071045 }, { "auxiliary_loss_clip": 0.01134832, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.05067825, "balance_loss_mlp": 1.02386713, "epoch": 0.4659412012264775, "flos": 14903180403840.0, "grad_norm": 2.2452279364130336, "language_loss": 0.81134903, "learning_rate": 2.3164844955479447e-06, "loss": 0.83301926, "num_input_tokens_seen": 83336475, "step": 3875, "time_per_iteration": 2.534827709197998 }, { "auxiliary_loss_clip": 0.01132594, "auxiliary_loss_mlp": 0.01027273, "balance_loss_clip": 1.04896784, "balance_loss_mlp": 1.01851118, "epoch": 0.4660614441171166, "flos": 24425612478720.0, "grad_norm": 1.7910436498652131, "language_loss": 0.70575571, "learning_rate": 2.3157153147087082e-06, "loss": 0.72735441, "num_input_tokens_seen": 83358365, "step": 3876, "time_per_iteration": 2.6902196407318115 }, { "auxiliary_loss_clip": 0.01133269, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.05277824, "balance_loss_mlp": 1.02636063, "epoch": 0.46618168700775564, "flos": 22091095843200.0, "grad_norm": 1.7588504894944452, "language_loss": 0.83163708, "learning_rate": 2.314946085975709e-06, "loss": 0.8533138, "num_input_tokens_seen": 83377345, "step": 3877, "time_per_iteration": 2.6133341789245605 }, { "auxiliary_loss_clip": 0.01128548, "auxiliary_loss_mlp": 0.01025281, "balance_loss_clip": 1.04870522, "balance_loss_mlp": 1.01761556, "epoch": 0.46630192989839475, "flos": 26176975810560.0, "grad_norm": 2.119438216964303, "language_loss": 0.82493347, "learning_rate": 2.3141768094656393e-06, "loss": 0.84647179, "num_input_tokens_seen": 83395920, "step": 3878, "time_per_iteration": 2.608243942260742 }, { "auxiliary_loss_clip": 0.01102321, "auxiliary_loss_mlp": 0.01030093, "balance_loss_clip": 1.04278803, "balance_loss_mlp": 1.02242756, "epoch": 0.46642217278903386, "flos": 11509622150400.0, "grad_norm": 2.222217430583525, "language_loss": 0.83117402, "learning_rate": 2.3134074852951966e-06, "loss": 0.85249817, "num_input_tokens_seen": 83412510, "step": 3879, "time_per_iteration": 2.6582043170928955 }, { "auxiliary_loss_clip": 0.01119769, "auxiliary_loss_mlp": 0.01027718, "balance_loss_clip": 1.04538953, "balance_loss_mlp": 1.01982975, "epoch": 0.4665424156796729, "flos": 32306819299200.0, "grad_norm": 1.6155251800985762, "language_loss": 0.77568078, "learning_rate": 2.312638113581088e-06, "loss": 0.79715562, "num_input_tokens_seen": 83432995, "step": 3880, "time_per_iteration": 2.7272143363952637 }, { "auxiliary_loss_clip": 0.01166962, "auxiliary_loss_mlp": 0.01033986, "balance_loss_clip": 1.05121613, "balance_loss_mlp": 1.02550411, "epoch": 0.46666265857031203, "flos": 18436179254400.0, "grad_norm": 2.3547796064904456, "language_loss": 0.78509176, "learning_rate": 2.311868694440027e-06, "loss": 0.80710119, "num_input_tokens_seen": 83447415, "step": 3881, "time_per_iteration": 3.337514638900757 }, { "auxiliary_loss_clip": 0.01070161, "auxiliary_loss_mlp": 0.0100918, "balance_loss_clip": 1.01617968, "balance_loss_mlp": 1.00783348, "epoch": 0.46678290146095114, "flos": 68438989221120.0, "grad_norm": 0.727560183523074, "language_loss": 0.62505478, "learning_rate": 2.3110992279887323e-06, "loss": 0.64584816, "num_input_tokens_seen": 83519340, "step": 3882, "time_per_iteration": 3.198573112487793 }, { "auxiliary_loss_clip": 0.01142617, "auxiliary_loss_mlp": 0.0102913, "balance_loss_clip": 1.04973936, "balance_loss_mlp": 1.0206542, "epoch": 0.4669031443515902, "flos": 17712507945600.0, "grad_norm": 2.1322390358433614, "language_loss": 0.84469318, "learning_rate": 2.310329714343932e-06, "loss": 0.86641067, "num_input_tokens_seen": 83535490, "step": 3883, "time_per_iteration": 2.5484302043914795 }, { "auxiliary_loss_clip": 0.01148289, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.05179524, "balance_loss_mlp": 1.02249503, "epoch": 0.4670233872422293, "flos": 23947748916480.0, "grad_norm": 1.8761355265439668, "language_loss": 0.82099986, "learning_rate": 2.309560153622361e-06, "loss": 0.84278911, "num_input_tokens_seen": 83552400, "step": 3884, "time_per_iteration": 2.542273759841919 }, { "auxiliary_loss_clip": 0.01141768, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.05225778, "balance_loss_mlp": 1.02674401, "epoch": 0.4671436301328684, "flos": 28111268131200.0, "grad_norm": 2.006038674657599, "language_loss": 0.74236071, "learning_rate": 2.3087905459407602e-06, "loss": 0.76413071, "num_input_tokens_seen": 83571340, "step": 3885, "time_per_iteration": 2.665985107421875 }, { "auxiliary_loss_clip": 0.01059669, "auxiliary_loss_mlp": 0.01004384, "balance_loss_clip": 1.01644313, "balance_loss_mlp": 1.00301266, "epoch": 0.46726387302350747, "flos": 69369684566400.0, "grad_norm": 0.7928181832434987, "language_loss": 0.62891114, "learning_rate": 2.3080208914158795e-06, "loss": 0.64955163, "num_input_tokens_seen": 83634340, "step": 3886, "time_per_iteration": 3.1286234855651855 }, { "auxiliary_loss_clip": 0.01148523, "auxiliary_loss_mlp": 0.01031732, "balance_loss_clip": 1.0526526, "balance_loss_mlp": 1.02390289, "epoch": 0.4673841159141466, "flos": 25519666878720.0, "grad_norm": 2.0306475785040248, "language_loss": 0.7225371, "learning_rate": 2.3072511901644753e-06, "loss": 0.74433964, "num_input_tokens_seen": 83653410, "step": 3887, "time_per_iteration": 4.093341827392578 }, { "auxiliary_loss_clip": 0.01181802, "auxiliary_loss_mlp": 0.01028334, "balance_loss_clip": 1.05630398, "balance_loss_mlp": 1.02094936, "epoch": 0.4675043588047857, "flos": 24499265316480.0, "grad_norm": 2.2318956981478046, "language_loss": 0.81010908, "learning_rate": 2.306481442303309e-06, "loss": 0.83221042, "num_input_tokens_seen": 83672985, "step": 3888, "time_per_iteration": 3.3080251216888428 }, { "auxiliary_loss_clip": 0.01166338, "auxiliary_loss_mlp": 0.01033419, "balance_loss_clip": 1.05092645, "balance_loss_mlp": 1.02476466, "epoch": 0.46762460169542475, "flos": 20960771685120.0, "grad_norm": 1.694845992512799, "language_loss": 0.73154181, "learning_rate": 2.3057116479491515e-06, "loss": 0.75353932, "num_input_tokens_seen": 83692395, "step": 3889, "time_per_iteration": 2.518341064453125 }, { "auxiliary_loss_clip": 0.01163209, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.04902935, "balance_loss_mlp": 1.02393103, "epoch": 0.46774484458606386, "flos": 19171666137600.0, "grad_norm": 1.8973026362773473, "language_loss": 0.76272416, "learning_rate": 2.30494180721878e-06, "loss": 0.78468168, "num_input_tokens_seen": 83709735, "step": 3890, "time_per_iteration": 2.543682098388672 }, { "auxiliary_loss_clip": 0.01164175, "auxiliary_loss_mlp": 0.01028183, "balance_loss_clip": 1.05160105, "balance_loss_mlp": 1.01975513, "epoch": 0.4678650874767029, "flos": 17967689141760.0, "grad_norm": 1.740823265077849, "language_loss": 0.89593226, "learning_rate": 2.3041719202289794e-06, "loss": 0.91785592, "num_input_tokens_seen": 83725910, "step": 3891, "time_per_iteration": 2.483513832092285 }, { "auxiliary_loss_clip": 0.01169813, "auxiliary_loss_mlp": 0.01034545, "balance_loss_clip": 1.05443358, "balance_loss_mlp": 1.02679658, "epoch": 0.467985330367342, "flos": 21360816432000.0, "grad_norm": 1.713767806777814, "language_loss": 0.80195999, "learning_rate": 2.30340198709654e-06, "loss": 0.82400358, "num_input_tokens_seen": 83745745, "step": 3892, "time_per_iteration": 2.5282087326049805 }, { "auxiliary_loss_clip": 0.01154199, "auxiliary_loss_mlp": 0.01031831, "balance_loss_clip": 1.04837322, "balance_loss_mlp": 1.02406406, "epoch": 0.46810557325798113, "flos": 20521835487360.0, "grad_norm": 1.8396250235333085, "language_loss": 0.74722648, "learning_rate": 2.3026320079382605e-06, "loss": 0.76908678, "num_input_tokens_seen": 83762680, "step": 3893, "time_per_iteration": 2.5444350242614746 }, { "auxiliary_loss_clip": 0.01180976, "auxiliary_loss_mlp": 0.01028823, "balance_loss_clip": 1.05601525, "balance_loss_mlp": 1.02129221, "epoch": 0.4682258161486202, "flos": 30117848572800.0, "grad_norm": 1.9798895464621107, "language_loss": 0.76180089, "learning_rate": 2.3018619828709454e-06, "loss": 0.78389883, "num_input_tokens_seen": 83784220, "step": 3894, "time_per_iteration": 2.5717382431030273 }, { "auxiliary_loss_clip": 0.01165505, "auxiliary_loss_mlp": 0.00761788, "balance_loss_clip": 1.0550313, "balance_loss_mlp": 1.00018752, "epoch": 0.4683460590392593, "flos": 25293357239040.0, "grad_norm": 1.8050647033380942, "language_loss": 0.82000065, "learning_rate": 2.3010919120114084e-06, "loss": 0.83927357, "num_input_tokens_seen": 83800750, "step": 3895, "time_per_iteration": 2.52693772315979 }, { "auxiliary_loss_clip": 0.01161918, "auxiliary_loss_mlp": 0.01026811, "balance_loss_clip": 1.0485121, "balance_loss_mlp": 1.01837659, "epoch": 0.4684663019298984, "flos": 15368330551680.0, "grad_norm": 2.0693518644685294, "language_loss": 0.66208112, "learning_rate": 2.3003217954764672e-06, "loss": 0.68396848, "num_input_tokens_seen": 83815455, "step": 3896, "time_per_iteration": 2.482342481613159 }, { "auxiliary_loss_clip": 0.01165969, "auxiliary_loss_mlp": 0.01030838, "balance_loss_clip": 1.04876697, "balance_loss_mlp": 1.02214134, "epoch": 0.46858654482053747, "flos": 27778842737280.0, "grad_norm": 1.59640223895007, "language_loss": 0.78956497, "learning_rate": 2.299551633382949e-06, "loss": 0.81153297, "num_input_tokens_seen": 83835765, "step": 3897, "time_per_iteration": 2.5802297592163086 }, { "auxiliary_loss_clip": 0.01146702, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.04998469, "balance_loss_mlp": 1.03057492, "epoch": 0.4687067877111766, "flos": 18040623707520.0, "grad_norm": 1.8357212993738075, "language_loss": 0.85143244, "learning_rate": 2.2987814258476854e-06, "loss": 0.87328982, "num_input_tokens_seen": 83853565, "step": 3898, "time_per_iteration": 2.531841516494751 }, { "auxiliary_loss_clip": 0.0112491, "auxiliary_loss_mlp": 0.01034142, "balance_loss_clip": 1.04381132, "balance_loss_mlp": 1.02574348, "epoch": 0.4688270306018157, "flos": 16977380198400.0, "grad_norm": 2.217319480354246, "language_loss": 0.68041766, "learning_rate": 2.2980111729875177e-06, "loss": 0.70200813, "num_input_tokens_seen": 83869815, "step": 3899, "time_per_iteration": 2.5848748683929443 }, { "auxiliary_loss_clip": 0.01149228, "auxiliary_loss_mlp": 0.01026652, "balance_loss_clip": 1.0512588, "balance_loss_mlp": 1.01851654, "epoch": 0.46894727349245474, "flos": 17821640442240.0, "grad_norm": 1.9478627951223486, "language_loss": 0.82462668, "learning_rate": 2.2972408749192917e-06, "loss": 0.84638548, "num_input_tokens_seen": 83887545, "step": 3900, "time_per_iteration": 2.517995595932007 }, { "auxiliary_loss_clip": 0.01166631, "auxiliary_loss_mlp": 0.00762115, "balance_loss_clip": 1.05607009, "balance_loss_mlp": 1.00016665, "epoch": 0.46906751638309385, "flos": 21471349559040.0, "grad_norm": 1.6827975520367686, "language_loss": 0.67046928, "learning_rate": 2.296470531759861e-06, "loss": 0.68975675, "num_input_tokens_seen": 83905645, "step": 3901, "time_per_iteration": 2.530261278152466 }, { "auxiliary_loss_clip": 0.01130221, "auxiliary_loss_mlp": 0.01028318, "balance_loss_clip": 1.04549503, "balance_loss_mlp": 1.01994991, "epoch": 0.46918775927373296, "flos": 20337829090560.0, "grad_norm": 2.7352540610051905, "language_loss": 0.7970295, "learning_rate": 2.2957001436260866e-06, "loss": 0.81861484, "num_input_tokens_seen": 83922705, "step": 3902, "time_per_iteration": 2.5584282875061035 }, { "auxiliary_loss_clip": 0.01150579, "auxiliary_loss_mlp": 0.01030141, "balance_loss_clip": 1.05079937, "balance_loss_mlp": 1.02219021, "epoch": 0.469308002164372, "flos": 18403249461120.0, "grad_norm": 1.6243092819596887, "language_loss": 0.73037952, "learning_rate": 2.294929710634836e-06, "loss": 0.75218666, "num_input_tokens_seen": 83940795, "step": 3903, "time_per_iteration": 2.5465402603149414 }, { "auxiliary_loss_clip": 0.01162254, "auxiliary_loss_mlp": 0.01030381, "balance_loss_clip": 1.04742312, "balance_loss_mlp": 1.02244198, "epoch": 0.46942824505501113, "flos": 37962067363200.0, "grad_norm": 3.567047069823146, "language_loss": 0.61552107, "learning_rate": 2.2941592329029823e-06, "loss": 0.63744736, "num_input_tokens_seen": 83961900, "step": 3904, "time_per_iteration": 2.6581130027770996 }, { "auxiliary_loss_clip": 0.01158489, "auxiliary_loss_mlp": 0.01033347, "balance_loss_clip": 1.04820371, "balance_loss_mlp": 1.02488303, "epoch": 0.46954848794565024, "flos": 21872507627520.0, "grad_norm": 1.838903502739008, "language_loss": 0.78900576, "learning_rate": 2.2933887105474067e-06, "loss": 0.81092405, "num_input_tokens_seen": 83980075, "step": 3905, "time_per_iteration": 2.5137081146240234 }, { "auxiliary_loss_clip": 0.01164159, "auxiliary_loss_mlp": 0.01026212, "balance_loss_clip": 1.05430579, "balance_loss_mlp": 1.01901197, "epoch": 0.4696687308362893, "flos": 22016545165440.0, "grad_norm": 1.5572191279671832, "language_loss": 0.81063116, "learning_rate": 2.2926181436849974e-06, "loss": 0.83253485, "num_input_tokens_seen": 83999430, "step": 3906, "time_per_iteration": 2.506103992462158 }, { "auxiliary_loss_clip": 0.01166203, "auxiliary_loss_mlp": 0.01028823, "balance_loss_clip": 1.05250573, "balance_loss_mlp": 1.0205977, "epoch": 0.4697889737269284, "flos": 21613663244160.0, "grad_norm": 1.563402698540988, "language_loss": 0.72724342, "learning_rate": 2.2918475324326478e-06, "loss": 0.74919373, "num_input_tokens_seen": 84019150, "step": 3907, "time_per_iteration": 3.364996910095215 }, { "auxiliary_loss_clip": 0.01169787, "auxiliary_loss_mlp": 0.00762393, "balance_loss_clip": 1.05277348, "balance_loss_mlp": 1.00017929, "epoch": 0.46990921661756746, "flos": 25228323665280.0, "grad_norm": 1.9375476284409454, "language_loss": 0.90829891, "learning_rate": 2.2910768769072603e-06, "loss": 0.92762077, "num_input_tokens_seen": 84037930, "step": 3908, "time_per_iteration": 2.5534403324127197 }, { "auxiliary_loss_clip": 0.01160707, "auxiliary_loss_mlp": 0.01036995, "balance_loss_clip": 1.0519259, "balance_loss_mlp": 1.02862597, "epoch": 0.47002945950820657, "flos": 13844031045120.0, "grad_norm": 1.8554245246851973, "language_loss": 0.75986612, "learning_rate": 2.2903061772257417e-06, "loss": 0.78184313, "num_input_tokens_seen": 84055915, "step": 3909, "time_per_iteration": 2.490959405899048 }, { "auxiliary_loss_clip": 0.01164989, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 1.05202198, "balance_loss_mlp": 1.01993203, "epoch": 0.4701497023988457, "flos": 26247001374720.0, "grad_norm": 1.5129640812035055, "language_loss": 0.78246135, "learning_rate": 2.289535433505007e-06, "loss": 0.80439115, "num_input_tokens_seen": 84077270, "step": 3910, "time_per_iteration": 2.5463273525238037 }, { "auxiliary_loss_clip": 0.01151413, "auxiliary_loss_mlp": 0.01030139, "balance_loss_clip": 1.04913914, "balance_loss_mlp": 1.02227712, "epoch": 0.47026994528948474, "flos": 25629517647360.0, "grad_norm": 1.636889011396557, "language_loss": 0.6361804, "learning_rate": 2.2887646458619767e-06, "loss": 0.65799594, "num_input_tokens_seen": 84098635, "step": 3911, "time_per_iteration": 2.603100061416626 }, { "auxiliary_loss_clip": 0.01142425, "auxiliary_loss_mlp": 0.01030627, "balance_loss_clip": 1.04917002, "balance_loss_mlp": 1.02240753, "epoch": 0.47039018818012385, "flos": 20554406144640.0, "grad_norm": 2.0021553836547783, "language_loss": 0.76944518, "learning_rate": 2.2879938144135797e-06, "loss": 0.79117572, "num_input_tokens_seen": 84114740, "step": 3912, "time_per_iteration": 2.5624849796295166 }, { "auxiliary_loss_clip": 0.01132748, "auxiliary_loss_mlp": 0.00761644, "balance_loss_clip": 1.04590869, "balance_loss_mlp": 1.0001657, "epoch": 0.47051043107076296, "flos": 21577249831680.0, "grad_norm": 1.5192092177692131, "language_loss": 0.7509439, "learning_rate": 2.2872229392767496e-06, "loss": 0.7698878, "num_input_tokens_seen": 84134845, "step": 3913, "time_per_iteration": 4.132529973983765 }, { "auxiliary_loss_clip": 0.01168875, "auxiliary_loss_mlp": 0.01032893, "balance_loss_clip": 1.05104733, "balance_loss_mlp": 1.02448249, "epoch": 0.470630673961402, "flos": 18953185662720.0, "grad_norm": 1.7540675212771286, "language_loss": 0.74724478, "learning_rate": 2.286452020568428e-06, "loss": 0.76926249, "num_input_tokens_seen": 84152920, "step": 3914, "time_per_iteration": 3.252521514892578 }, { "auxiliary_loss_clip": 0.01181644, "auxiliary_loss_mlp": 0.01033029, "balance_loss_clip": 1.05153704, "balance_loss_mlp": 1.02470779, "epoch": 0.4707509168520411, "flos": 19938969492480.0, "grad_norm": 2.2520114601687493, "language_loss": 0.73179603, "learning_rate": 2.2856810584055637e-06, "loss": 0.75394285, "num_input_tokens_seen": 84170455, "step": 3915, "time_per_iteration": 2.4646458625793457 }, { "auxiliary_loss_clip": 0.01166099, "auxiliary_loss_mlp": 0.01027052, "balance_loss_clip": 1.05092239, "balance_loss_mlp": 1.01913071, "epoch": 0.47087115974268023, "flos": 40118754741120.0, "grad_norm": 1.4780928351736522, "language_loss": 0.67742598, "learning_rate": 2.2849100529051085e-06, "loss": 0.69935751, "num_input_tokens_seen": 84197390, "step": 3916, "time_per_iteration": 2.726388931274414 }, { "auxiliary_loss_clip": 0.01178774, "auxiliary_loss_mlp": 0.0103831, "balance_loss_clip": 1.05240011, "balance_loss_mlp": 1.0309751, "epoch": 0.4709914026333193, "flos": 13552723745280.0, "grad_norm": 2.7057552742525353, "language_loss": 0.80281681, "learning_rate": 2.284139004184026e-06, "loss": 0.82498765, "num_input_tokens_seen": 84214620, "step": 3917, "time_per_iteration": 2.472241163253784 }, { "auxiliary_loss_clip": 0.01178922, "auxiliary_loss_mlp": 0.01035868, "balance_loss_clip": 1.05321789, "balance_loss_mlp": 1.02791691, "epoch": 0.4711116455239584, "flos": 19974628719360.0, "grad_norm": 2.189669136642054, "language_loss": 0.74927092, "learning_rate": 2.2833679123592814e-06, "loss": 0.77141881, "num_input_tokens_seen": 84231880, "step": 3918, "time_per_iteration": 2.475940704345703 }, { "auxiliary_loss_clip": 0.01154049, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 1.05231667, "balance_loss_mlp": 1.02465129, "epoch": 0.4712318884145975, "flos": 32124824064000.0, "grad_norm": 1.6228785313046032, "language_loss": 0.63101077, "learning_rate": 2.2825967775478508e-06, "loss": 0.65287811, "num_input_tokens_seen": 84252980, "step": 3919, "time_per_iteration": 2.6257011890411377 }, { "auxiliary_loss_clip": 0.01177611, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.05179477, "balance_loss_mlp": 1.02304125, "epoch": 0.47135213130523657, "flos": 20047850593920.0, "grad_norm": 1.8486734145210084, "language_loss": 0.83338773, "learning_rate": 2.2818255998667135e-06, "loss": 0.85547686, "num_input_tokens_seen": 84271490, "step": 3920, "time_per_iteration": 2.4669673442840576 }, { "auxiliary_loss_clip": 0.01162315, "auxiliary_loss_mlp": 0.01032574, "balance_loss_clip": 1.05178928, "balance_loss_mlp": 1.02514184, "epoch": 0.4714723741958757, "flos": 19426990988160.0, "grad_norm": 1.5635842243028915, "language_loss": 0.78991145, "learning_rate": 2.2810543794328566e-06, "loss": 0.81186038, "num_input_tokens_seen": 84290525, "step": 3921, "time_per_iteration": 2.4861972332000732 }, { "auxiliary_loss_clip": 0.01168633, "auxiliary_loss_mlp": 0.01031298, "balance_loss_clip": 1.05235636, "balance_loss_mlp": 1.02368021, "epoch": 0.4715926170865148, "flos": 20373883367040.0, "grad_norm": 3.004643065864307, "language_loss": 0.82328463, "learning_rate": 2.2802831163632735e-06, "loss": 0.84528387, "num_input_tokens_seen": 84309245, "step": 3922, "time_per_iteration": 2.5522544384002686 }, { "auxiliary_loss_clip": 0.01112703, "auxiliary_loss_mlp": 0.01031126, "balance_loss_clip": 1.04564762, "balance_loss_mlp": 1.02285922, "epoch": 0.47171285997715384, "flos": 22672884430080.0, "grad_norm": 1.6170425431261597, "language_loss": 0.7428416, "learning_rate": 2.279511810774965e-06, "loss": 0.76427996, "num_input_tokens_seen": 84330775, "step": 3923, "time_per_iteration": 2.6586432456970215 }, { "auxiliary_loss_clip": 0.01180909, "auxiliary_loss_mlp": 0.01031565, "balance_loss_clip": 1.0543195, "balance_loss_mlp": 1.02401948, "epoch": 0.47183310286779295, "flos": 21105419754240.0, "grad_norm": 1.7469766430134295, "language_loss": 0.71685362, "learning_rate": 2.2787404627849364e-06, "loss": 0.73897839, "num_input_tokens_seen": 84349985, "step": 3924, "time_per_iteration": 2.4851696491241455 }, { "auxiliary_loss_clip": 0.01149304, "auxiliary_loss_mlp": 0.01030756, "balance_loss_clip": 1.04878771, "balance_loss_mlp": 1.02307296, "epoch": 0.471953345758432, "flos": 21726566668800.0, "grad_norm": 1.612684360113646, "language_loss": 0.79078019, "learning_rate": 2.277969072510202e-06, "loss": 0.81258082, "num_input_tokens_seen": 84368965, "step": 3925, "time_per_iteration": 2.538074493408203 }, { "auxiliary_loss_clip": 0.01151399, "auxiliary_loss_mlp": 0.01027254, "balance_loss_clip": 1.04948401, "balance_loss_mlp": 1.01990521, "epoch": 0.4720735886490711, "flos": 19861078849920.0, "grad_norm": 1.5485895590699696, "language_loss": 0.81604922, "learning_rate": 2.2771976400677803e-06, "loss": 0.83783573, "num_input_tokens_seen": 84387795, "step": 3926, "time_per_iteration": 2.538931369781494 }, { "auxiliary_loss_clip": 0.01114525, "auxiliary_loss_mlp": 0.01028004, "balance_loss_clip": 1.04662442, "balance_loss_mlp": 1.02002609, "epoch": 0.47219383153971023, "flos": 19171809792000.0, "grad_norm": 1.692485970786244, "language_loss": 0.78992987, "learning_rate": 2.2764261655746965e-06, "loss": 0.81135511, "num_input_tokens_seen": 84405290, "step": 3927, "time_per_iteration": 2.603553295135498 }, { "auxiliary_loss_clip": 0.01132171, "auxiliary_loss_mlp": 0.01032791, "balance_loss_clip": 1.04592681, "balance_loss_mlp": 1.02417803, "epoch": 0.4723140744303493, "flos": 23224005780480.0, "grad_norm": 1.556848264405677, "language_loss": 0.75647151, "learning_rate": 2.2756546491479832e-06, "loss": 0.77812111, "num_input_tokens_seen": 84426205, "step": 3928, "time_per_iteration": 2.6090219020843506 }, { "auxiliary_loss_clip": 0.01179429, "auxiliary_loss_mlp": 0.00761806, "balance_loss_clip": 1.05232882, "balance_loss_mlp": 1.00017488, "epoch": 0.4724343173209884, "flos": 18223265387520.0, "grad_norm": 2.041596191964343, "language_loss": 0.8020885, "learning_rate": 2.274883090904679e-06, "loss": 0.8215009, "num_input_tokens_seen": 84443970, "step": 3929, "time_per_iteration": 2.469534397125244 }, { "auxiliary_loss_clip": 0.01183273, "auxiliary_loss_mlp": 0.01036216, "balance_loss_clip": 1.05540276, "balance_loss_mlp": 1.02800214, "epoch": 0.4725545602116275, "flos": 21251037490560.0, "grad_norm": 2.232378255633167, "language_loss": 0.67507076, "learning_rate": 2.2741114909618283e-06, "loss": 0.69726562, "num_input_tokens_seen": 84459865, "step": 3930, "time_per_iteration": 2.4906420707702637 }, { "auxiliary_loss_clip": 0.01136227, "auxiliary_loss_mlp": 0.0102574, "balance_loss_clip": 1.04800224, "balance_loss_mlp": 1.0182898, "epoch": 0.47267480310226656, "flos": 21434002392960.0, "grad_norm": 1.6590227297045423, "language_loss": 0.71863639, "learning_rate": 2.2733398494364828e-06, "loss": 0.74025613, "num_input_tokens_seen": 84479110, "step": 3931, "time_per_iteration": 2.593660354614258 }, { "auxiliary_loss_clip": 0.01143926, "auxiliary_loss_mlp": 0.01033488, "balance_loss_clip": 1.05067146, "balance_loss_mlp": 1.02590084, "epoch": 0.47279504599290567, "flos": 18770508069120.0, "grad_norm": 1.8491693413299166, "language_loss": 0.84393948, "learning_rate": 2.272568166445699e-06, "loss": 0.8657136, "num_input_tokens_seen": 84497675, "step": 3932, "time_per_iteration": 2.5517945289611816 }, { "auxiliary_loss_clip": 0.01163522, "auxiliary_loss_mlp": 0.01028343, "balance_loss_clip": 1.04981983, "balance_loss_mlp": 1.0206182, "epoch": 0.4729152888835448, "flos": 21105742976640.0, "grad_norm": 1.8267886887332803, "language_loss": 0.64324546, "learning_rate": 2.271796442106541e-06, "loss": 0.66516411, "num_input_tokens_seen": 84517030, "step": 3933, "time_per_iteration": 3.355485439300537 }, { "auxiliary_loss_clip": 0.01035772, "auxiliary_loss_mlp": 0.01002797, "balance_loss_clip": 1.0136143, "balance_loss_mlp": 1.00149214, "epoch": 0.47303553177418384, "flos": 70201877840640.0, "grad_norm": 0.7978838220985509, "language_loss": 0.56495929, "learning_rate": 2.271024676536079e-06, "loss": 0.58534503, "num_input_tokens_seen": 84577290, "step": 3934, "time_per_iteration": 3.1282217502593994 }, { "auxiliary_loss_clip": 0.01155895, "auxiliary_loss_mlp": 0.01031006, "balance_loss_clip": 1.05285645, "balance_loss_mlp": 1.0225302, "epoch": 0.47315577466482295, "flos": 22455122227200.0, "grad_norm": 2.228985135556569, "language_loss": 0.73288155, "learning_rate": 2.2702528698513894e-06, "loss": 0.75475055, "num_input_tokens_seen": 84598415, "step": 3935, "time_per_iteration": 2.6323509216308594 }, { "auxiliary_loss_clip": 0.01150695, "auxiliary_loss_mlp": 0.01034604, "balance_loss_clip": 1.04723334, "balance_loss_mlp": 1.02688825, "epoch": 0.47327601755546206, "flos": 24352857480960.0, "grad_norm": 1.7967169162338479, "language_loss": 0.78633618, "learning_rate": 2.269481022169554e-06, "loss": 0.80818909, "num_input_tokens_seen": 84617010, "step": 3936, "time_per_iteration": 2.647722005844116 }, { "auxiliary_loss_clip": 0.01155327, "auxiliary_loss_mlp": 0.01029941, "balance_loss_clip": 1.04822898, "balance_loss_mlp": 1.02164364, "epoch": 0.4733962604461011, "flos": 22926772736640.0, "grad_norm": 1.681395721417986, "language_loss": 0.80698836, "learning_rate": 2.2687091336076614e-06, "loss": 0.82884097, "num_input_tokens_seen": 84636350, "step": 3937, "time_per_iteration": 2.618433713912964 }, { "auxiliary_loss_clip": 0.01162932, "auxiliary_loss_mlp": 0.01028249, "balance_loss_clip": 1.05132699, "balance_loss_mlp": 1.02042902, "epoch": 0.4735165033367402, "flos": 18327369980160.0, "grad_norm": 1.7405370142244998, "language_loss": 0.80115747, "learning_rate": 2.267937204282807e-06, "loss": 0.82306933, "num_input_tokens_seen": 84653490, "step": 3938, "time_per_iteration": 2.48189377784729 }, { "auxiliary_loss_clip": 0.01172503, "auxiliary_loss_mlp": 0.01038909, "balance_loss_clip": 1.05408669, "balance_loss_mlp": 1.03039742, "epoch": 0.4736367462273793, "flos": 23037018554880.0, "grad_norm": 2.262137563712265, "language_loss": 0.78977883, "learning_rate": 2.2671652343120926e-06, "loss": 0.81189293, "num_input_tokens_seen": 84673965, "step": 3939, "time_per_iteration": 4.048202037811279 }, { "auxiliary_loss_clip": 0.01181121, "auxiliary_loss_mlp": 0.010318, "balance_loss_clip": 1.05631661, "balance_loss_mlp": 1.02448988, "epoch": 0.4737569891180184, "flos": 25374336451200.0, "grad_norm": 1.845085334500575, "language_loss": 0.80522966, "learning_rate": 2.2663932238126236e-06, "loss": 0.82735884, "num_input_tokens_seen": 84692525, "step": 3940, "time_per_iteration": 2.5288443565368652 }, { "auxiliary_loss_clip": 0.01160506, "auxiliary_loss_mlp": 0.01033134, "balance_loss_clip": 1.04750323, "balance_loss_mlp": 1.02542675, "epoch": 0.4738772320086575, "flos": 25849326925440.0, "grad_norm": 1.4772941871666148, "language_loss": 0.80332726, "learning_rate": 2.265621172901515e-06, "loss": 0.82526368, "num_input_tokens_seen": 84715640, "step": 3941, "time_per_iteration": 3.367682933807373 }, { "auxiliary_loss_clip": 0.011825, "auxiliary_loss_mlp": 0.01031108, "balance_loss_clip": 1.05643082, "balance_loss_mlp": 1.02335346, "epoch": 0.47399747489929656, "flos": 27564420499200.0, "grad_norm": 1.9920965809376945, "language_loss": 0.70915246, "learning_rate": 2.2648490816958854e-06, "loss": 0.73128849, "num_input_tokens_seen": 84736635, "step": 3942, "time_per_iteration": 2.5433263778686523 }, { "auxiliary_loss_clip": 0.01162805, "auxiliary_loss_mlp": 0.01031769, "balance_loss_clip": 1.04856837, "balance_loss_mlp": 1.02352023, "epoch": 0.47411771778993567, "flos": 24863650836480.0, "grad_norm": 2.0759731521740363, "language_loss": 0.73060918, "learning_rate": 2.264076950312861e-06, "loss": 0.75255495, "num_input_tokens_seen": 84755445, "step": 3943, "time_per_iteration": 2.532470703125 }, { "auxiliary_loss_clip": 0.01151498, "auxiliary_loss_mlp": 0.01030839, "balance_loss_clip": 1.04935503, "balance_loss_mlp": 1.02327478, "epoch": 0.4742379606805748, "flos": 22748009725440.0, "grad_norm": 1.8387574114134273, "language_loss": 0.82601106, "learning_rate": 2.2633047788695727e-06, "loss": 0.84783447, "num_input_tokens_seen": 84775750, "step": 3944, "time_per_iteration": 2.5444822311401367 }, { "auxiliary_loss_clip": 0.01151438, "auxiliary_loss_mlp": 0.01031708, "balance_loss_clip": 1.053509, "balance_loss_mlp": 1.02447212, "epoch": 0.47435820357121383, "flos": 19681130689920.0, "grad_norm": 1.9076267188105778, "language_loss": 0.64342898, "learning_rate": 2.262532567483159e-06, "loss": 0.66526043, "num_input_tokens_seen": 84794310, "step": 3945, "time_per_iteration": 2.546480178833008 }, { "auxiliary_loss_clip": 0.01180131, "auxiliary_loss_mlp": 0.00761289, "balance_loss_clip": 1.05452442, "balance_loss_mlp": 1.0001812, "epoch": 0.47447844646185294, "flos": 25228718714880.0, "grad_norm": 1.850369250283981, "language_loss": 0.80350924, "learning_rate": 2.2617603162707635e-06, "loss": 0.82292342, "num_input_tokens_seen": 84814720, "step": 3946, "time_per_iteration": 2.5039408206939697 }, { "auxiliary_loss_clip": 0.01178933, "auxiliary_loss_mlp": 0.01028509, "balance_loss_clip": 1.0538367, "balance_loss_mlp": 1.02099919, "epoch": 0.47459868935249205, "flos": 24570619683840.0, "grad_norm": 2.219563561605798, "language_loss": 0.8281334, "learning_rate": 2.2609880253495363e-06, "loss": 0.85020781, "num_input_tokens_seen": 84834355, "step": 3947, "time_per_iteration": 2.497504472732544 }, { "auxiliary_loss_clip": 0.01142446, "auxiliary_loss_mlp": 0.010273, "balance_loss_clip": 1.04655635, "balance_loss_mlp": 1.01913977, "epoch": 0.4747189322431311, "flos": 20558500295040.0, "grad_norm": 1.9542831626803954, "language_loss": 0.86350775, "learning_rate": 2.260215694836633e-06, "loss": 0.88520521, "num_input_tokens_seen": 84853530, "step": 3948, "time_per_iteration": 2.579791784286499 }, { "auxiliary_loss_clip": 0.01121982, "auxiliary_loss_mlp": 0.00761169, "balance_loss_clip": 1.04382873, "balance_loss_mlp": 1.00016689, "epoch": 0.4748391751337702, "flos": 25995231970560.0, "grad_norm": 1.8404678040867195, "language_loss": 0.64315259, "learning_rate": 2.2594433248492157e-06, "loss": 0.66198415, "num_input_tokens_seen": 84872505, "step": 3949, "time_per_iteration": 2.6583266258239746 }, { "auxiliary_loss_clip": 0.01169968, "auxiliary_loss_mlp": 0.01029759, "balance_loss_clip": 1.05209148, "balance_loss_mlp": 1.02173054, "epoch": 0.47495941802440933, "flos": 22821052032000.0, "grad_norm": 1.6853420303300406, "language_loss": 0.79790032, "learning_rate": 2.2586709155044527e-06, "loss": 0.81989753, "num_input_tokens_seen": 84893105, "step": 3950, "time_per_iteration": 2.54876971244812 }, { "auxiliary_loss_clip": 0.0118089, "auxiliary_loss_mlp": 0.01034768, "balance_loss_clip": 1.05388212, "balance_loss_mlp": 1.02684951, "epoch": 0.4750796609150484, "flos": 27891782075520.0, "grad_norm": 2.312447648838554, "language_loss": 0.76019019, "learning_rate": 2.2578984669195167e-06, "loss": 0.78234684, "num_input_tokens_seen": 84914070, "step": 3951, "time_per_iteration": 2.554088592529297 }, { "auxiliary_loss_clip": 0.01162092, "auxiliary_loss_mlp": 0.01029979, "balance_loss_clip": 1.04919946, "balance_loss_mlp": 1.0227077, "epoch": 0.4751999038056875, "flos": 35660085471360.0, "grad_norm": 1.9391014752637712, "language_loss": 0.67957413, "learning_rate": 2.2571259792115887e-06, "loss": 0.70149481, "num_input_tokens_seen": 84935290, "step": 3952, "time_per_iteration": 2.6680448055267334 }, { "auxiliary_loss_clip": 0.01159763, "auxiliary_loss_mlp": 0.01026857, "balance_loss_clip": 1.05053854, "balance_loss_mlp": 1.01947165, "epoch": 0.4753201466963266, "flos": 22090880361600.0, "grad_norm": 1.639231989016119, "language_loss": 0.79299831, "learning_rate": 2.2563534524978544e-06, "loss": 0.81486458, "num_input_tokens_seen": 84952760, "step": 3953, "time_per_iteration": 2.533249616622925 }, { "auxiliary_loss_clip": 0.01133074, "auxiliary_loss_mlp": 0.01025517, "balance_loss_clip": 1.05083966, "balance_loss_mlp": 1.0182395, "epoch": 0.47544038958696566, "flos": 30190854965760.0, "grad_norm": 1.5925792225058881, "language_loss": 0.70638573, "learning_rate": 2.2555808868955052e-06, "loss": 0.72797161, "num_input_tokens_seen": 84974890, "step": 3954, "time_per_iteration": 2.690821409225464 }, { "auxiliary_loss_clip": 0.01121853, "auxiliary_loss_mlp": 0.01028344, "balance_loss_clip": 1.04829121, "balance_loss_mlp": 1.02013588, "epoch": 0.47556063247760477, "flos": 23472219738240.0, "grad_norm": 1.9772267978023321, "language_loss": 0.73416579, "learning_rate": 2.254808282521738e-06, "loss": 0.75566781, "num_input_tokens_seen": 84993640, "step": 3955, "time_per_iteration": 2.6859192848205566 }, { "auxiliary_loss_clip": 0.01138808, "auxiliary_loss_mlp": 0.00761118, "balance_loss_clip": 1.04716563, "balance_loss_mlp": 1.00017762, "epoch": 0.4756808753682438, "flos": 25155209531520.0, "grad_norm": 1.6719420765735047, "language_loss": 0.8130362, "learning_rate": 2.2540356394937573e-06, "loss": 0.83203542, "num_input_tokens_seen": 85012340, "step": 3956, "time_per_iteration": 2.622450113296509 }, { "auxiliary_loss_clip": 0.01140015, "auxiliary_loss_mlp": 0.01026401, "balance_loss_clip": 1.04704762, "balance_loss_mlp": 1.01821756, "epoch": 0.47580111825888294, "flos": 15669729573120.0, "grad_norm": 2.160331826371299, "language_loss": 0.84287542, "learning_rate": 2.253262957928772e-06, "loss": 0.86453962, "num_input_tokens_seen": 85029225, "step": 3957, "time_per_iteration": 2.5811872482299805 }, { "auxiliary_loss_clip": 0.01142045, "auxiliary_loss_mlp": 0.01033455, "balance_loss_clip": 1.0449307, "balance_loss_mlp": 1.02541447, "epoch": 0.47592136114952205, "flos": 17636556637440.0, "grad_norm": 1.6487403302546868, "language_loss": 0.71700585, "learning_rate": 2.2524902379439976e-06, "loss": 0.73876083, "num_input_tokens_seen": 85047895, "step": 3958, "time_per_iteration": 2.5213100910186768 }, { "auxiliary_loss_clip": 0.01012743, "auxiliary_loss_mlp": 0.01008677, "balance_loss_clip": 1.01643276, "balance_loss_mlp": 1.0072341, "epoch": 0.4760416040401611, "flos": 61417159292160.0, "grad_norm": 0.7399358780206213, "language_loss": 0.63730258, "learning_rate": 2.251717479656655e-06, "loss": 0.65751678, "num_input_tokens_seen": 85112690, "step": 3959, "time_per_iteration": 4.097006797790527 }, { "auxiliary_loss_clip": 0.01180465, "auxiliary_loss_mlp": 0.01028744, "balance_loss_clip": 1.05373633, "balance_loss_mlp": 1.02110267, "epoch": 0.4761618469308002, "flos": 18405871153920.0, "grad_norm": 1.7915878874984799, "language_loss": 0.76179051, "learning_rate": 2.2509446831839704e-06, "loss": 0.78388256, "num_input_tokens_seen": 85132130, "step": 3960, "time_per_iteration": 2.6185150146484375 }, { "auxiliary_loss_clip": 0.01152576, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.04839694, "balance_loss_mlp": 1.01924467, "epoch": 0.4762820898214393, "flos": 18040911016320.0, "grad_norm": 2.0561947348066494, "language_loss": 0.82206821, "learning_rate": 2.250171848643177e-06, "loss": 0.84386456, "num_input_tokens_seen": 85149420, "step": 3961, "time_per_iteration": 2.521286725997925 }, { "auxiliary_loss_clip": 0.01148423, "auxiliary_loss_mlp": 0.01034296, "balance_loss_clip": 1.05215073, "balance_loss_mlp": 1.02659178, "epoch": 0.4764023327120784, "flos": 19318253541120.0, "grad_norm": 2.8442630293970743, "language_loss": 0.86016786, "learning_rate": 2.249398976151513e-06, "loss": 0.88199508, "num_input_tokens_seen": 85166970, "step": 3962, "time_per_iteration": 2.5312082767486572 }, { "auxiliary_loss_clip": 0.01179057, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 1.05504215, "balance_loss_mlp": 1.02121794, "epoch": 0.4765225756027175, "flos": 22747255539840.0, "grad_norm": 2.086182106234943, "language_loss": 0.78521729, "learning_rate": 2.248626065826223e-06, "loss": 0.80730337, "num_input_tokens_seen": 85185175, "step": 3963, "time_per_iteration": 2.492340326309204 }, { "auxiliary_loss_clip": 0.01071053, "auxiliary_loss_mlp": 0.01002114, "balance_loss_clip": 1.01742625, "balance_loss_mlp": 1.00067794, "epoch": 0.4766428184933566, "flos": 65933392106880.0, "grad_norm": 0.759086812486289, "language_loss": 0.62603271, "learning_rate": 2.2478531177845564e-06, "loss": 0.6467644, "num_input_tokens_seen": 85246170, "step": 3964, "time_per_iteration": 3.061748743057251 }, { "auxiliary_loss_clip": 0.01155685, "auxiliary_loss_mlp": 0.01031652, "balance_loss_clip": 1.05315292, "balance_loss_mlp": 1.02368879, "epoch": 0.47676306138399566, "flos": 24136495908480.0, "grad_norm": 1.9100977605636438, "language_loss": 0.8522768, "learning_rate": 2.247080132143769e-06, "loss": 0.87415022, "num_input_tokens_seen": 85268525, "step": 3965, "time_per_iteration": 4.066251516342163 }, { "auxiliary_loss_clip": 0.01135427, "auxiliary_loss_mlp": 0.01029643, "balance_loss_clip": 1.04628849, "balance_loss_mlp": 1.02154887, "epoch": 0.47688330427463477, "flos": 12604322995200.0, "grad_norm": 2.2003461394593375, "language_loss": 0.69576621, "learning_rate": 2.246307109021121e-06, "loss": 0.71741688, "num_input_tokens_seen": 85285930, "step": 3966, "time_per_iteration": 2.59199595451355 }, { "auxiliary_loss_clip": 0.01144144, "auxiliary_loss_mlp": 0.01034986, "balance_loss_clip": 1.04627824, "balance_loss_mlp": 1.02715135, "epoch": 0.4770035471652739, "flos": 21390585828480.0, "grad_norm": 1.6214443206927447, "language_loss": 0.82301164, "learning_rate": 2.2455340485338817e-06, "loss": 0.84480298, "num_input_tokens_seen": 85303565, "step": 3967, "time_per_iteration": 3.5031399726867676 }, { "auxiliary_loss_clip": 0.01165322, "auxiliary_loss_mlp": 0.01032454, "balance_loss_clip": 1.05085135, "balance_loss_mlp": 1.02508664, "epoch": 0.47712379005591293, "flos": 25156251025920.0, "grad_norm": 1.771508718412266, "language_loss": 0.67901975, "learning_rate": 2.244760950799322e-06, "loss": 0.70099747, "num_input_tokens_seen": 85321835, "step": 3968, "time_per_iteration": 2.557223081588745 }, { "auxiliary_loss_clip": 0.01125212, "auxiliary_loss_mlp": 0.01027342, "balance_loss_clip": 1.04864621, "balance_loss_mlp": 1.0201385, "epoch": 0.47724403294655204, "flos": 22054323294720.0, "grad_norm": 2.398224335324407, "language_loss": 0.72330874, "learning_rate": 2.2439878159347203e-06, "loss": 0.7448343, "num_input_tokens_seen": 85341260, "step": 3969, "time_per_iteration": 2.6182138919830322 }, { "auxiliary_loss_clip": 0.01069775, "auxiliary_loss_mlp": 0.01001497, "balance_loss_clip": 1.01640606, "balance_loss_mlp": 1.0000844, "epoch": 0.4773642758371911, "flos": 70229387658240.0, "grad_norm": 1.9717527003684086, "language_loss": 0.55326581, "learning_rate": 2.2432146440573616e-06, "loss": 0.57397848, "num_input_tokens_seen": 85407220, "step": 3970, "time_per_iteration": 3.182111978530884 }, { "auxiliary_loss_clip": 0.01149075, "auxiliary_loss_mlp": 0.01027192, "balance_loss_clip": 1.05274391, "balance_loss_mlp": 1.01931798, "epoch": 0.4774845187278302, "flos": 23548602009600.0, "grad_norm": 1.8719651561898045, "language_loss": 0.66609597, "learning_rate": 2.242441435284534e-06, "loss": 0.68785858, "num_input_tokens_seen": 85426095, "step": 3971, "time_per_iteration": 2.579507350921631 }, { "auxiliary_loss_clip": 0.01164238, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.0516454, "balance_loss_mlp": 1.02785099, "epoch": 0.4776047616184693, "flos": 23075371301760.0, "grad_norm": 2.6058485311740522, "language_loss": 0.85106385, "learning_rate": 2.2416681897335337e-06, "loss": 0.87306249, "num_input_tokens_seen": 85444245, "step": 3972, "time_per_iteration": 2.546626091003418 }, { "auxiliary_loss_clip": 0.01122509, "auxiliary_loss_mlp": 0.01028217, "balance_loss_clip": 1.04949832, "balance_loss_mlp": 1.02015209, "epoch": 0.4777250045091084, "flos": 31898119374720.0, "grad_norm": 1.7717607357995335, "language_loss": 0.67126673, "learning_rate": 2.240894907521661e-06, "loss": 0.692774, "num_input_tokens_seen": 85463325, "step": 3973, "time_per_iteration": 2.7341911792755127 }, { "auxiliary_loss_clip": 0.01149909, "auxiliary_loss_mlp": 0.01029443, "balance_loss_clip": 1.0495162, "balance_loss_mlp": 1.02166462, "epoch": 0.4778452473997475, "flos": 24278163148800.0, "grad_norm": 2.1234857826427653, "language_loss": 0.63661569, "learning_rate": 2.240121588766223e-06, "loss": 0.65840924, "num_input_tokens_seen": 85483375, "step": 3974, "time_per_iteration": 2.556644916534424 }, { "auxiliary_loss_clip": 0.01147063, "auxiliary_loss_mlp": 0.01023514, "balance_loss_clip": 1.04932594, "balance_loss_mlp": 1.01643252, "epoch": 0.4779654902903866, "flos": 31575031516800.0, "grad_norm": 4.044783973308583, "language_loss": 0.71096683, "learning_rate": 2.239348233584531e-06, "loss": 0.73267257, "num_input_tokens_seen": 85504230, "step": 3975, "time_per_iteration": 2.6246495246887207 }, { "auxiliary_loss_clip": 0.01167684, "auxiliary_loss_mlp": 0.01032355, "balance_loss_clip": 1.05257654, "balance_loss_mlp": 1.02463603, "epoch": 0.47808573318102565, "flos": 19500428344320.0, "grad_norm": 1.778489207228484, "language_loss": 0.81412137, "learning_rate": 2.2385748420939013e-06, "loss": 0.83612174, "num_input_tokens_seen": 85523425, "step": 3976, "time_per_iteration": 2.489778757095337 }, { "auxiliary_loss_clip": 0.01179546, "auxiliary_loss_mlp": 0.01033407, "balance_loss_clip": 1.05774784, "balance_loss_mlp": 1.02578712, "epoch": 0.47820597607166476, "flos": 22601135013120.0, "grad_norm": 1.870072484383098, "language_loss": 0.72178924, "learning_rate": 2.2378014144116583e-06, "loss": 0.74391878, "num_input_tokens_seen": 85542235, "step": 3977, "time_per_iteration": 2.4951300621032715 }, { "auxiliary_loss_clip": 0.01181846, "auxiliary_loss_mlp": 0.01032158, "balance_loss_clip": 1.05438972, "balance_loss_mlp": 1.0244987, "epoch": 0.4783262189623039, "flos": 23003011353600.0, "grad_norm": 1.9253909897351225, "language_loss": 0.79754639, "learning_rate": 2.23702795065513e-06, "loss": 0.81968641, "num_input_tokens_seen": 85561815, "step": 3978, "time_per_iteration": 2.50985050201416 }, { "auxiliary_loss_clip": 0.01061374, "auxiliary_loss_mlp": 0.01001495, "balance_loss_clip": 1.0169214, "balance_loss_mlp": 1.00005841, "epoch": 0.47844646185294293, "flos": 49772801226240.0, "grad_norm": 0.977430210277476, "language_loss": 0.6746949, "learning_rate": 2.2362544509416493e-06, "loss": 0.69532359, "num_input_tokens_seen": 85613930, "step": 3979, "time_per_iteration": 2.9790618419647217 }, { "auxiliary_loss_clip": 0.01145246, "auxiliary_loss_mlp": 0.01027929, "balance_loss_clip": 1.04822421, "balance_loss_mlp": 1.02035332, "epoch": 0.47856670474358204, "flos": 20229558520320.0, "grad_norm": 2.768240885639337, "language_loss": 0.82600975, "learning_rate": 2.2354809153885572e-06, "loss": 0.84774148, "num_input_tokens_seen": 85631000, "step": 3980, "time_per_iteration": 2.553560495376587 }, { "auxiliary_loss_clip": 0.01163607, "auxiliary_loss_mlp": 0.01032151, "balance_loss_clip": 1.05143261, "balance_loss_mlp": 1.02402043, "epoch": 0.47868694763422115, "flos": 20990936131200.0, "grad_norm": 1.8690246954365137, "language_loss": 0.83501577, "learning_rate": 2.234707344113197e-06, "loss": 0.85697341, "num_input_tokens_seen": 85649095, "step": 3981, "time_per_iteration": 2.521613121032715 }, { "auxiliary_loss_clip": 0.01176529, "auxiliary_loss_mlp": 0.01026044, "balance_loss_clip": 1.05378509, "balance_loss_mlp": 1.01900196, "epoch": 0.4788071905248602, "flos": 19026551191680.0, "grad_norm": 1.6489277543723717, "language_loss": 0.776752, "learning_rate": 2.233933737232919e-06, "loss": 0.7987777, "num_input_tokens_seen": 85666875, "step": 3982, "time_per_iteration": 2.466125011444092 }, { "auxiliary_loss_clip": 0.01111844, "auxiliary_loss_mlp": 0.0076095, "balance_loss_clip": 1.04427624, "balance_loss_mlp": 1.00019252, "epoch": 0.4789274334154993, "flos": 23002221254400.0, "grad_norm": 1.5473221794891645, "language_loss": 0.78299975, "learning_rate": 2.2331600948650793e-06, "loss": 0.80172765, "num_input_tokens_seen": 85687020, "step": 3983, "time_per_iteration": 2.6191110610961914 }, { "auxiliary_loss_clip": 0.01124425, "auxiliary_loss_mlp": 0.00762052, "balance_loss_clip": 1.04793549, "balance_loss_mlp": 1.00017416, "epoch": 0.4790476763061384, "flos": 23075586783360.0, "grad_norm": 1.4388350378835197, "language_loss": 0.79908121, "learning_rate": 2.2323864171270386e-06, "loss": 0.81794596, "num_input_tokens_seen": 85708290, "step": 3984, "time_per_iteration": 2.6517903804779053 }, { "auxiliary_loss_clip": 0.0113907, "auxiliary_loss_mlp": 0.01028397, "balance_loss_clip": 1.04728818, "balance_loss_mlp": 1.02009392, "epoch": 0.4791679191967775, "flos": 21179288073600.0, "grad_norm": 1.949776765138691, "language_loss": 0.72600627, "learning_rate": 2.231612704136164e-06, "loss": 0.7476809, "num_input_tokens_seen": 85728660, "step": 3985, "time_per_iteration": 3.4202473163604736 }, { "auxiliary_loss_clip": 0.01158696, "auxiliary_loss_mlp": 0.01026233, "balance_loss_clip": 1.04895258, "balance_loss_mlp": 1.01763225, "epoch": 0.4792881620874166, "flos": 22301495758080.0, "grad_norm": 4.268952485456957, "language_loss": 0.74631119, "learning_rate": 2.2308389560098253e-06, "loss": 0.76816052, "num_input_tokens_seen": 85745035, "step": 3986, "time_per_iteration": 2.5143215656280518 }, { "auxiliary_loss_clip": 0.01136238, "auxiliary_loss_mlp": 0.01027575, "balance_loss_clip": 1.04976749, "balance_loss_mlp": 1.0199964, "epoch": 0.47940840497805565, "flos": 17420877423360.0, "grad_norm": 2.827476928955933, "language_loss": 0.7683782, "learning_rate": 2.2300651728654008e-06, "loss": 0.79001629, "num_input_tokens_seen": 85760295, "step": 3987, "time_per_iteration": 2.5738797187805176 }, { "auxiliary_loss_clip": 0.01058171, "auxiliary_loss_mlp": 0.00751276, "balance_loss_clip": 1.01714182, "balance_loss_mlp": 1.00010014, "epoch": 0.47952864786869476, "flos": 65358175708800.0, "grad_norm": 0.7351302843937069, "language_loss": 0.60197484, "learning_rate": 2.229291354820272e-06, "loss": 0.62006938, "num_input_tokens_seen": 85821305, "step": 3988, "time_per_iteration": 3.136117935180664 }, { "auxiliary_loss_clip": 0.01161535, "auxiliary_loss_mlp": 0.01034505, "balance_loss_clip": 1.04976618, "balance_loss_mlp": 1.02638721, "epoch": 0.47964889075933387, "flos": 16799802336000.0, "grad_norm": 1.981617809978044, "language_loss": 0.75704062, "learning_rate": 2.228517501991828e-06, "loss": 0.77900106, "num_input_tokens_seen": 85840105, "step": 3989, "time_per_iteration": 2.503363847732544 }, { "auxiliary_loss_clip": 0.01048167, "auxiliary_loss_mlp": 0.01002562, "balance_loss_clip": 1.01621389, "balance_loss_mlp": 1.00117326, "epoch": 0.4797691336499729, "flos": 70079244808320.0, "grad_norm": 0.8063816426794026, "language_loss": 0.61041033, "learning_rate": 2.22774361449746e-06, "loss": 0.63091755, "num_input_tokens_seen": 85896585, "step": 3990, "time_per_iteration": 3.15537428855896 }, { "auxiliary_loss_clip": 0.01103873, "auxiliary_loss_mlp": 0.01027612, "balance_loss_clip": 1.04462826, "balance_loss_mlp": 1.01975036, "epoch": 0.47988937654061203, "flos": 18953329317120.0, "grad_norm": 2.8388554812268874, "language_loss": 0.70909339, "learning_rate": 2.2269696924545668e-06, "loss": 0.73040825, "num_input_tokens_seen": 85914415, "step": 3991, "time_per_iteration": 4.190083980560303 }, { "auxiliary_loss_clip": 0.01141036, "auxiliary_loss_mlp": 0.01030184, "balance_loss_clip": 1.05404794, "balance_loss_mlp": 1.02288258, "epoch": 0.48000961943125114, "flos": 14461981649280.0, "grad_norm": 2.2797506574674467, "language_loss": 0.78162849, "learning_rate": 2.2261957359805523e-06, "loss": 0.80334073, "num_input_tokens_seen": 85931650, "step": 3992, "time_per_iteration": 2.55216121673584 }, { "auxiliary_loss_clip": 0.0117966, "auxiliary_loss_mlp": 0.01030247, "balance_loss_clip": 1.05448627, "balance_loss_mlp": 1.02274823, "epoch": 0.4801298623218902, "flos": 27051149105280.0, "grad_norm": 3.106254225084741, "language_loss": 0.74114382, "learning_rate": 2.225421745192823e-06, "loss": 0.76324296, "num_input_tokens_seen": 85951805, "step": 3993, "time_per_iteration": 3.3045389652252197 }, { "auxiliary_loss_clip": 0.01164053, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 1.05241632, "balance_loss_mlp": 1.0208385, "epoch": 0.4802501052125293, "flos": 26355236031360.0, "grad_norm": 1.9796454466552096, "language_loss": 0.78063267, "learning_rate": 2.2246477202087955e-06, "loss": 0.80255961, "num_input_tokens_seen": 85972485, "step": 3994, "time_per_iteration": 2.6068384647369385 }, { "auxiliary_loss_clip": 0.01148765, "auxiliary_loss_mlp": 0.01029241, "balance_loss_clip": 1.04744458, "balance_loss_mlp": 1.02200198, "epoch": 0.4803703481031684, "flos": 20993916960000.0, "grad_norm": 1.5316409024774031, "language_loss": 0.83059919, "learning_rate": 2.223873661145887e-06, "loss": 0.85237926, "num_input_tokens_seen": 85992540, "step": 3995, "time_per_iteration": 2.5599682331085205 }, { "auxiliary_loss_clip": 0.01146067, "auxiliary_loss_mlp": 0.00760812, "balance_loss_clip": 1.05174065, "balance_loss_mlp": 1.00016689, "epoch": 0.4804905909938075, "flos": 20703722981760.0, "grad_norm": 1.6193879549098928, "language_loss": 0.71145433, "learning_rate": 2.2230995681215226e-06, "loss": 0.73052317, "num_input_tokens_seen": 86012065, "step": 3996, "time_per_iteration": 2.5559229850769043 }, { "auxiliary_loss_clip": 0.01131958, "auxiliary_loss_mlp": 0.01028398, "balance_loss_clip": 1.04652822, "balance_loss_mlp": 1.02088237, "epoch": 0.4806108338844466, "flos": 16654831044480.0, "grad_norm": 1.9900032715070965, "language_loss": 0.77919853, "learning_rate": 2.2223254412531305e-06, "loss": 0.80080205, "num_input_tokens_seen": 86029435, "step": 3997, "time_per_iteration": 2.5612854957580566 }, { "auxiliary_loss_clip": 0.0113526, "auxiliary_loss_mlp": 0.01026685, "balance_loss_clip": 1.04583359, "balance_loss_mlp": 1.01975298, "epoch": 0.4807310767750857, "flos": 20011329440640.0, "grad_norm": 1.8612498124672565, "language_loss": 0.82181007, "learning_rate": 2.221551280658146e-06, "loss": 0.84342957, "num_input_tokens_seen": 86048495, "step": 3998, "time_per_iteration": 2.5735697746276855 }, { "auxiliary_loss_clip": 0.01119965, "auxiliary_loss_mlp": 0.01027391, "balance_loss_clip": 1.04767525, "balance_loss_mlp": 1.01991081, "epoch": 0.48085131966572475, "flos": 23185257984000.0, "grad_norm": 1.8153344623258607, "language_loss": 0.74120212, "learning_rate": 2.2207770864540085e-06, "loss": 0.76267564, "num_input_tokens_seen": 86067470, "step": 3999, "time_per_iteration": 2.6267201900482178 }, { "auxiliary_loss_clip": 0.01143297, "auxiliary_loss_mlp": 0.01026951, "balance_loss_clip": 1.04958999, "balance_loss_mlp": 1.01960206, "epoch": 0.48097156255636386, "flos": 20558643949440.0, "grad_norm": 1.7871746016833943, "language_loss": 0.72524142, "learning_rate": 2.220002858758162e-06, "loss": 0.74694383, "num_input_tokens_seen": 86085460, "step": 4000, "time_per_iteration": 2.555229663848877 }, { "auxiliary_loss_clip": 0.01060842, "auxiliary_loss_mlp": 0.0100089, "balance_loss_clip": 1.01636076, "balance_loss_mlp": 0.99946576, "epoch": 0.481091805447003, "flos": 70511608817280.0, "grad_norm": 0.8747779589611542, "language_loss": 0.60924417, "learning_rate": 2.2192285976880573e-06, "loss": 0.62986153, "num_input_tokens_seen": 86149715, "step": 4001, "time_per_iteration": 3.1436238288879395 }, { "auxiliary_loss_clip": 0.01136058, "auxiliary_loss_mlp": 0.00760389, "balance_loss_clip": 1.04692721, "balance_loss_mlp": 1.0001905, "epoch": 0.48121204833764203, "flos": 36428214839040.0, "grad_norm": 1.475812490575636, "language_loss": 0.80399537, "learning_rate": 2.2184543033611485e-06, "loss": 0.8229599, "num_input_tokens_seen": 86170795, "step": 4002, "time_per_iteration": 2.750775098800659 }, { "auxiliary_loss_clip": 0.01165655, "auxiliary_loss_mlp": 0.01030245, "balance_loss_clip": 1.05215633, "balance_loss_mlp": 1.0227828, "epoch": 0.48133229122828114, "flos": 27490264871040.0, "grad_norm": 2.012272104266677, "language_loss": 0.81680131, "learning_rate": 2.2176799758948957e-06, "loss": 0.83876026, "num_input_tokens_seen": 86190955, "step": 4003, "time_per_iteration": 2.5892810821533203 }, { "auxiliary_loss_clip": 0.01145912, "auxiliary_loss_mlp": 0.01030518, "balance_loss_clip": 1.05083501, "balance_loss_mlp": 1.02284098, "epoch": 0.4814525341189202, "flos": 43072802179200.0, "grad_norm": 2.7804680343752044, "language_loss": 0.73161995, "learning_rate": 2.2169056154067635e-06, "loss": 0.75338423, "num_input_tokens_seen": 86214875, "step": 4004, "time_per_iteration": 2.755753517150879 }, { "auxiliary_loss_clip": 0.01164706, "auxiliary_loss_mlp": 0.00761444, "balance_loss_clip": 1.05307865, "balance_loss_mlp": 1.0001719, "epoch": 0.4815727770095593, "flos": 24236901400320.0, "grad_norm": 1.6214464525549772, "language_loss": 0.82187021, "learning_rate": 2.216131222014222e-06, "loss": 0.84113169, "num_input_tokens_seen": 86232950, "step": 4005, "time_per_iteration": 2.5263490676879883 }, { "auxiliary_loss_clip": 0.01132763, "auxiliary_loss_mlp": 0.01031151, "balance_loss_clip": 1.05056143, "balance_loss_mlp": 1.02309871, "epoch": 0.4816930199001984, "flos": 18113630100480.0, "grad_norm": 2.2252178630512023, "language_loss": 0.80090868, "learning_rate": 2.2153567958347455e-06, "loss": 0.82254785, "num_input_tokens_seen": 86249160, "step": 4006, "time_per_iteration": 2.563032865524292 }, { "auxiliary_loss_clip": 0.01150592, "auxiliary_loss_mlp": 0.01029494, "balance_loss_clip": 1.0513953, "balance_loss_mlp": 1.02192986, "epoch": 0.48181326279083747, "flos": 17274720983040.0, "grad_norm": 2.108983949883118, "language_loss": 0.79924417, "learning_rate": 2.214582336985815e-06, "loss": 0.82104504, "num_input_tokens_seen": 86267060, "step": 4007, "time_per_iteration": 2.5239100456237793 }, { "auxiliary_loss_clip": 0.01140299, "auxiliary_loss_mlp": 0.01029063, "balance_loss_clip": 1.04837453, "balance_loss_mlp": 1.02068257, "epoch": 0.4819335056814766, "flos": 14903252231040.0, "grad_norm": 2.1543547897025124, "language_loss": 0.66224939, "learning_rate": 2.2138078455849142e-06, "loss": 0.68394303, "num_input_tokens_seen": 86285055, "step": 4008, "time_per_iteration": 2.5297999382019043 }, { "auxiliary_loss_clip": 0.01168726, "auxiliary_loss_mlp": 0.01028633, "balance_loss_clip": 1.05400479, "balance_loss_mlp": 1.02078342, "epoch": 0.4820537485721157, "flos": 19244888012160.0, "grad_norm": 1.9603910524836532, "language_loss": 0.78604567, "learning_rate": 2.2130333217495334e-06, "loss": 0.80801928, "num_input_tokens_seen": 86304225, "step": 4009, "time_per_iteration": 2.5136220455169678 }, { "auxiliary_loss_clip": 0.01143948, "auxiliary_loss_mlp": 0.01023803, "balance_loss_clip": 1.04816818, "balance_loss_mlp": 1.01555359, "epoch": 0.48217399146275475, "flos": 16033791870720.0, "grad_norm": 2.3351533644879954, "language_loss": 0.67210323, "learning_rate": 2.2122587655971665e-06, "loss": 0.69378072, "num_input_tokens_seen": 86319170, "step": 4010, "time_per_iteration": 2.5136637687683105 }, { "auxiliary_loss_clip": 0.01150734, "auxiliary_loss_mlp": 0.01029284, "balance_loss_clip": 1.04866862, "balance_loss_mlp": 1.02146971, "epoch": 0.48229423435339386, "flos": 24134197438080.0, "grad_norm": 1.600429004435232, "language_loss": 0.63934451, "learning_rate": 2.211484177245314e-06, "loss": 0.66114467, "num_input_tokens_seen": 86338760, "step": 4011, "time_per_iteration": 3.331915855407715 }, { "auxiliary_loss_clip": 0.0118332, "auxiliary_loss_mlp": 0.01033481, "balance_loss_clip": 1.05671453, "balance_loss_mlp": 1.02586675, "epoch": 0.48241447724403297, "flos": 23805435231360.0, "grad_norm": 2.519162730710436, "language_loss": 0.72179806, "learning_rate": 2.21070955681148e-06, "loss": 0.7439661, "num_input_tokens_seen": 86357865, "step": 4012, "time_per_iteration": 2.508892297744751 }, { "auxiliary_loss_clip": 0.01130899, "auxiliary_loss_mlp": 0.01031211, "balance_loss_clip": 1.04912531, "balance_loss_mlp": 1.0235101, "epoch": 0.482534720134672, "flos": 23110312256640.0, "grad_norm": 1.5960944280797, "language_loss": 0.78454077, "learning_rate": 2.209934904413174e-06, "loss": 0.80616188, "num_input_tokens_seen": 86379470, "step": 4013, "time_per_iteration": 2.6428723335266113 }, { "auxiliary_loss_clip": 0.01099578, "auxiliary_loss_mlp": 0.01028502, "balance_loss_clip": 1.03646159, "balance_loss_mlp": 1.02073538, "epoch": 0.48265496302531113, "flos": 20923819568640.0, "grad_norm": 2.698081441431407, "language_loss": 0.71742678, "learning_rate": 2.2091602201679095e-06, "loss": 0.7387076, "num_input_tokens_seen": 86399080, "step": 4014, "time_per_iteration": 2.668179750442505 }, { "auxiliary_loss_clip": 0.01142136, "auxiliary_loss_mlp": 0.01028103, "balance_loss_clip": 1.05055273, "balance_loss_mlp": 1.02077174, "epoch": 0.48277520591595025, "flos": 15231152511360.0, "grad_norm": 2.6054109924025974, "language_loss": 0.83522624, "learning_rate": 2.208385504193206e-06, "loss": 0.85692859, "num_input_tokens_seen": 86416580, "step": 4015, "time_per_iteration": 2.577590227127075 }, { "auxiliary_loss_clip": 0.01179164, "auxiliary_loss_mlp": 0.01034312, "balance_loss_clip": 1.05312657, "balance_loss_mlp": 1.0264976, "epoch": 0.4828954488065893, "flos": 17858664385920.0, "grad_norm": 2.632757901985482, "language_loss": 0.80917168, "learning_rate": 2.2076107566065873e-06, "loss": 0.8313064, "num_input_tokens_seen": 86434365, "step": 4016, "time_per_iteration": 3.2712457180023193 }, { "auxiliary_loss_clip": 0.01163751, "auxiliary_loss_mlp": 0.01027248, "balance_loss_clip": 1.05134606, "balance_loss_mlp": 1.01972044, "epoch": 0.4830156916972284, "flos": 32087405070720.0, "grad_norm": 2.8871880787574415, "language_loss": 0.75517869, "learning_rate": 2.2068359775255816e-06, "loss": 0.7770887, "num_input_tokens_seen": 86452675, "step": 4017, "time_per_iteration": 3.335500478744507 }, { "auxiliary_loss_clip": 0.01115586, "auxiliary_loss_mlp": 0.01024103, "balance_loss_clip": 1.04447579, "balance_loss_mlp": 1.01703119, "epoch": 0.48313593458786747, "flos": 21871717528320.0, "grad_norm": 2.600462468368549, "language_loss": 0.78452688, "learning_rate": 2.206061167067723e-06, "loss": 0.80592376, "num_input_tokens_seen": 86470785, "step": 4018, "time_per_iteration": 2.616544008255005 }, { "auxiliary_loss_clip": 0.01132228, "auxiliary_loss_mlp": 0.01033255, "balance_loss_clip": 1.04560328, "balance_loss_mlp": 1.02524459, "epoch": 0.4832561774785066, "flos": 22601206840320.0, "grad_norm": 2.2237475934678677, "language_loss": 0.79455584, "learning_rate": 2.205286325350549e-06, "loss": 0.81621069, "num_input_tokens_seen": 86489850, "step": 4019, "time_per_iteration": 3.404973030090332 }, { "auxiliary_loss_clip": 0.01123015, "auxiliary_loss_mlp": 0.01023122, "balance_loss_clip": 1.0470891, "balance_loss_mlp": 1.01544762, "epoch": 0.4833764203691457, "flos": 13437342282240.0, "grad_norm": 2.028423215309233, "language_loss": 0.72353923, "learning_rate": 2.204511452491603e-06, "loss": 0.7450006, "num_input_tokens_seen": 86506475, "step": 4020, "time_per_iteration": 2.5949995517730713 }, { "auxiliary_loss_clip": 0.01176376, "auxiliary_loss_mlp": 0.01025069, "balance_loss_clip": 1.05371487, "balance_loss_mlp": 1.01763046, "epoch": 0.48349666325978474, "flos": 44128036955520.0, "grad_norm": 2.1442901975976203, "language_loss": 0.74869215, "learning_rate": 2.2037365486084316e-06, "loss": 0.77070665, "num_input_tokens_seen": 86529715, "step": 4021, "time_per_iteration": 2.69012451171875 }, { "auxiliary_loss_clip": 0.0114027, "auxiliary_loss_mlp": 0.010277, "balance_loss_clip": 1.0456481, "balance_loss_mlp": 1.01982319, "epoch": 0.48361690615042385, "flos": 26028377245440.0, "grad_norm": 1.9142871364194984, "language_loss": 0.77770281, "learning_rate": 2.2029616138185886e-06, "loss": 0.79938257, "num_input_tokens_seen": 86548715, "step": 4022, "time_per_iteration": 2.643929958343506 }, { "auxiliary_loss_clip": 0.01134375, "auxiliary_loss_mlp": 0.01029914, "balance_loss_clip": 1.05152273, "balance_loss_mlp": 1.02279437, "epoch": 0.48373714904106296, "flos": 22273306560000.0, "grad_norm": 1.6174966182640038, "language_loss": 0.82555622, "learning_rate": 2.202186648239629e-06, "loss": 0.84719908, "num_input_tokens_seen": 86568650, "step": 4023, "time_per_iteration": 2.62048602104187 }, { "auxiliary_loss_clip": 0.0115994, "auxiliary_loss_mlp": 0.01031759, "balance_loss_clip": 1.05097759, "balance_loss_mlp": 1.02420735, "epoch": 0.483857391931702, "flos": 28292293699200.0, "grad_norm": 1.6498759540523387, "language_loss": 0.71230006, "learning_rate": 2.201411651989117e-06, "loss": 0.73421705, "num_input_tokens_seen": 86590630, "step": 4024, "time_per_iteration": 2.59281063079834 }, { "auxiliary_loss_clip": 0.01148538, "auxiliary_loss_mlp": 0.0076175, "balance_loss_clip": 1.05190039, "balance_loss_mlp": 1.00017858, "epoch": 0.48397763482234113, "flos": 27418048577280.0, "grad_norm": 1.8402177395924462, "language_loss": 0.78306925, "learning_rate": 2.2006366251846167e-06, "loss": 0.80217218, "num_input_tokens_seen": 86611270, "step": 4025, "time_per_iteration": 2.624256134033203 }, { "auxiliary_loss_clip": 0.0115027, "auxiliary_loss_mlp": 0.01031118, "balance_loss_clip": 1.05340743, "balance_loss_mlp": 1.02426982, "epoch": 0.48409787771298024, "flos": 16797252470400.0, "grad_norm": 1.7143266876349377, "language_loss": 0.7562052, "learning_rate": 2.1998615679436997e-06, "loss": 0.77801907, "num_input_tokens_seen": 86628810, "step": 4026, "time_per_iteration": 2.5462794303894043 }, { "auxiliary_loss_clip": 0.01154383, "auxiliary_loss_mlp": 0.01032655, "balance_loss_clip": 1.04938698, "balance_loss_mlp": 1.02521586, "epoch": 0.4842181206036193, "flos": 25083496028160.0, "grad_norm": 3.2010503417048475, "language_loss": 0.77529085, "learning_rate": 2.199086480383942e-06, "loss": 0.79716128, "num_input_tokens_seen": 86648185, "step": 4027, "time_per_iteration": 2.6159818172454834 }, { "auxiliary_loss_clip": 0.01161395, "auxiliary_loss_mlp": 0.0103394, "balance_loss_clip": 1.0515337, "balance_loss_mlp": 1.02512407, "epoch": 0.4843383634942584, "flos": 30372311496960.0, "grad_norm": 3.055089041942076, "language_loss": 0.67505139, "learning_rate": 2.1983113626229234e-06, "loss": 0.69700474, "num_input_tokens_seen": 86667435, "step": 4028, "time_per_iteration": 2.6492321491241455 }, { "auxiliary_loss_clip": 0.01127701, "auxiliary_loss_mlp": 0.00761316, "balance_loss_clip": 1.04470658, "balance_loss_mlp": 1.00017357, "epoch": 0.4844586063848975, "flos": 20413564917120.0, "grad_norm": 1.6573761412155803, "language_loss": 0.78257048, "learning_rate": 2.1975362147782293e-06, "loss": 0.80146062, "num_input_tokens_seen": 86686630, "step": 4029, "time_per_iteration": 2.5921969413757324 }, { "auxiliary_loss_clip": 0.01043839, "auxiliary_loss_mlp": 0.01004685, "balance_loss_clip": 1.01488161, "balance_loss_mlp": 1.00338602, "epoch": 0.48457884927553657, "flos": 70303722854400.0, "grad_norm": 0.6908391657320504, "language_loss": 0.54187286, "learning_rate": 2.196761036967448e-06, "loss": 0.56235814, "num_input_tokens_seen": 86754595, "step": 4030, "time_per_iteration": 3.257030487060547 }, { "auxiliary_loss_clip": 0.01160573, "auxiliary_loss_mlp": 0.01033433, "balance_loss_clip": 1.05137587, "balance_loss_mlp": 1.02609241, "epoch": 0.4846990921661757, "flos": 19934516206080.0, "grad_norm": 1.8272301791107664, "language_loss": 0.77643198, "learning_rate": 2.1959858293081743e-06, "loss": 0.79837203, "num_input_tokens_seen": 86773730, "step": 4031, "time_per_iteration": 2.5692660808563232 }, { "auxiliary_loss_clip": 0.01133803, "auxiliary_loss_mlp": 0.01041287, "balance_loss_clip": 1.05005598, "balance_loss_mlp": 1.03341889, "epoch": 0.4848193350568148, "flos": 23075945919360.0, "grad_norm": 1.6055250886791677, "language_loss": 0.75976408, "learning_rate": 2.1952105919180056e-06, "loss": 0.781515, "num_input_tokens_seen": 86792985, "step": 4032, "time_per_iteration": 2.578372001647949 }, { "auxiliary_loss_clip": 0.01149847, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.05311418, "balance_loss_mlp": 1.02529001, "epoch": 0.48493957794745385, "flos": 22455481363200.0, "grad_norm": 2.349521395535041, "language_loss": 0.67889404, "learning_rate": 2.1944353249145456e-06, "loss": 0.70072675, "num_input_tokens_seen": 86812095, "step": 4033, "time_per_iteration": 2.579235792160034 }, { "auxiliary_loss_clip": 0.01179153, "auxiliary_loss_mlp": 0.01029148, "balance_loss_clip": 1.0559746, "balance_loss_mlp": 1.02179539, "epoch": 0.48505982083809296, "flos": 25046112948480.0, "grad_norm": 1.6261506679807713, "language_loss": 0.74237657, "learning_rate": 2.193660028415401e-06, "loss": 0.76445955, "num_input_tokens_seen": 86832875, "step": 4034, "time_per_iteration": 2.5190722942352295 }, { "auxiliary_loss_clip": 0.0114249, "auxiliary_loss_mlp": 0.01031383, "balance_loss_clip": 1.04914093, "balance_loss_mlp": 1.02362275, "epoch": 0.485180063728732, "flos": 26761386090240.0, "grad_norm": 1.722517719795405, "language_loss": 0.82385945, "learning_rate": 2.1928847025381852e-06, "loss": 0.8455981, "num_input_tokens_seen": 86853480, "step": 4035, "time_per_iteration": 2.588080644607544 }, { "auxiliary_loss_clip": 0.011608, "auxiliary_loss_mlp": 0.01031167, "balance_loss_clip": 1.0478096, "balance_loss_mlp": 1.02326345, "epoch": 0.4853003066193711, "flos": 24059143969920.0, "grad_norm": 1.5252970715865397, "language_loss": 0.83826762, "learning_rate": 2.192109347400512e-06, "loss": 0.86018729, "num_input_tokens_seen": 86873695, "step": 4036, "time_per_iteration": 2.5357491970062256 }, { "auxiliary_loss_clip": 0.0115259, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.04936695, "balance_loss_mlp": 1.02652144, "epoch": 0.48542054951001024, "flos": 23076376882560.0, "grad_norm": 1.7408563411096505, "language_loss": 0.78937685, "learning_rate": 2.191333963120004e-06, "loss": 0.81125355, "num_input_tokens_seen": 86892675, "step": 4037, "time_per_iteration": 3.404409885406494 }, { "auxiliary_loss_clip": 0.01149877, "auxiliary_loss_mlp": 0.01029875, "balance_loss_clip": 1.05166602, "balance_loss_mlp": 1.02238822, "epoch": 0.4855407924006493, "flos": 25664889565440.0, "grad_norm": 2.4861297603864636, "language_loss": 0.70156872, "learning_rate": 2.190558549814286e-06, "loss": 0.72336626, "num_input_tokens_seen": 86912835, "step": 4038, "time_per_iteration": 2.597511053085327 }, { "auxiliary_loss_clip": 0.01144261, "auxiliary_loss_mlp": 0.01035472, "balance_loss_clip": 1.04641986, "balance_loss_mlp": 1.02795005, "epoch": 0.4856610352912884, "flos": 23987933256960.0, "grad_norm": 1.737694827150912, "language_loss": 0.79473281, "learning_rate": 2.1897831076009872e-06, "loss": 0.81653011, "num_input_tokens_seen": 86932475, "step": 4039, "time_per_iteration": 2.5654983520507812 }, { "auxiliary_loss_clip": 0.0116489, "auxiliary_loss_mlp": 0.01027156, "balance_loss_clip": 1.05261707, "balance_loss_mlp": 1.01962805, "epoch": 0.4857812781819275, "flos": 24096814358400.0, "grad_norm": 1.6355243623738087, "language_loss": 0.7973761, "learning_rate": 2.1890076365977426e-06, "loss": 0.8192966, "num_input_tokens_seen": 86952300, "step": 4040, "time_per_iteration": 2.539324998855591 }, { "auxiliary_loss_clip": 0.01048204, "auxiliary_loss_mlp": 0.01002121, "balance_loss_clip": 1.01729321, "balance_loss_mlp": 1.00082195, "epoch": 0.48590152107256657, "flos": 56266635185280.0, "grad_norm": 0.858785060791233, "language_loss": 0.52764708, "learning_rate": 2.188232136922189e-06, "loss": 0.5481503, "num_input_tokens_seen": 87010420, "step": 4041, "time_per_iteration": 3.053109884262085 }, { "auxiliary_loss_clip": 0.01099899, "auxiliary_loss_mlp": 0.01026392, "balance_loss_clip": 1.04441988, "balance_loss_mlp": 1.01746917, "epoch": 0.4860217639632057, "flos": 20046988667520.0, "grad_norm": 1.8051528523886142, "language_loss": 0.75476456, "learning_rate": 2.187456608691971e-06, "loss": 0.77602744, "num_input_tokens_seen": 87029295, "step": 4042, "time_per_iteration": 3.4495980739593506 }, { "auxiliary_loss_clip": 0.01140106, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.05177724, "balance_loss_mlp": 1.0268898, "epoch": 0.4861420068538448, "flos": 17822143232640.0, "grad_norm": 2.190687452590082, "language_loss": 0.87439227, "learning_rate": 2.1866810520247334e-06, "loss": 0.89614081, "num_input_tokens_seen": 87048165, "step": 4043, "time_per_iteration": 3.3202764987945557 }, { "auxiliary_loss_clip": 0.0116759, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.05128634, "balance_loss_mlp": 1.02531874, "epoch": 0.48626224974448384, "flos": 26250125857920.0, "grad_norm": 1.7056525803082725, "language_loss": 0.64610386, "learning_rate": 2.185905467038129e-06, "loss": 0.66811144, "num_input_tokens_seen": 87067070, "step": 4044, "time_per_iteration": 2.564095973968506 }, { "auxiliary_loss_clip": 0.01175131, "auxiliary_loss_mlp": 0.01030285, "balance_loss_clip": 1.05414701, "balance_loss_mlp": 1.02281344, "epoch": 0.48638249263512295, "flos": 22054502862720.0, "grad_norm": 1.6522589853284468, "language_loss": 0.77471197, "learning_rate": 2.1851298538498127e-06, "loss": 0.7967661, "num_input_tokens_seen": 87086785, "step": 4045, "time_per_iteration": 3.2632195949554443 }, { "auxiliary_loss_clip": 0.01170786, "auxiliary_loss_mlp": 0.00762248, "balance_loss_clip": 1.05473697, "balance_loss_mlp": 1.00017881, "epoch": 0.48650273552576206, "flos": 25119945354240.0, "grad_norm": 1.9718942086976732, "language_loss": 0.80125046, "learning_rate": 2.184354212577446e-06, "loss": 0.82058084, "num_input_tokens_seen": 87107090, "step": 4046, "time_per_iteration": 2.567845106124878 }, { "auxiliary_loss_clip": 0.01179402, "auxiliary_loss_mlp": 0.01023495, "balance_loss_clip": 1.05189753, "balance_loss_mlp": 1.01557994, "epoch": 0.4866229784164011, "flos": 17456931699840.0, "grad_norm": 2.6059000385717512, "language_loss": 0.624017, "learning_rate": 2.1835785433386907e-06, "loss": 0.64604604, "num_input_tokens_seen": 87125905, "step": 4047, "time_per_iteration": 2.4632349014282227 }, { "auxiliary_loss_clip": 0.01128825, "auxiliary_loss_mlp": 0.01033269, "balance_loss_clip": 1.04939997, "balance_loss_mlp": 1.02525806, "epoch": 0.48674322130704023, "flos": 23331127115520.0, "grad_norm": 1.8593704666182245, "language_loss": 0.65319127, "learning_rate": 2.182802846251216e-06, "loss": 0.6748122, "num_input_tokens_seen": 87146175, "step": 4048, "time_per_iteration": 2.6029865741729736 }, { "auxiliary_loss_clip": 0.01140859, "auxiliary_loss_mlp": 0.01027971, "balance_loss_clip": 1.04788971, "balance_loss_mlp": 1.02032661, "epoch": 0.4868634641976793, "flos": 28804344030720.0, "grad_norm": 1.7979211317055557, "language_loss": 0.72281069, "learning_rate": 2.182027121432696e-06, "loss": 0.74449897, "num_input_tokens_seen": 87166800, "step": 4049, "time_per_iteration": 2.724167585372925 }, { "auxiliary_loss_clip": 0.01182192, "auxiliary_loss_mlp": 0.01027452, "balance_loss_clip": 1.05471921, "balance_loss_mlp": 1.01927471, "epoch": 0.4869837070883184, "flos": 19025976574080.0, "grad_norm": 2.1716943160475806, "language_loss": 0.82376295, "learning_rate": 2.1812513690008054e-06, "loss": 0.84585941, "num_input_tokens_seen": 87185920, "step": 4050, "time_per_iteration": 2.488213300704956 }, { "auxiliary_loss_clip": 0.01172095, "auxiliary_loss_mlp": 0.01033539, "balance_loss_clip": 1.05360746, "balance_loss_mlp": 1.0255816, "epoch": 0.4871039499789575, "flos": 15121409483520.0, "grad_norm": 2.0672555592850683, "language_loss": 0.79591835, "learning_rate": 2.180475589073227e-06, "loss": 0.81797469, "num_input_tokens_seen": 87203620, "step": 4051, "time_per_iteration": 2.5021560192108154 }, { "auxiliary_loss_clip": 0.01153007, "auxiliary_loss_mlp": 0.01023362, "balance_loss_clip": 1.04803586, "balance_loss_mlp": 1.01601255, "epoch": 0.48722419286959656, "flos": 26174066808960.0, "grad_norm": 1.7298100598468127, "language_loss": 0.73434186, "learning_rate": 2.1796997817676456e-06, "loss": 0.75610554, "num_input_tokens_seen": 87224630, "step": 4052, "time_per_iteration": 2.5463407039642334 }, { "auxiliary_loss_clip": 0.01165575, "auxiliary_loss_mlp": 0.00760735, "balance_loss_clip": 1.05238986, "balance_loss_mlp": 1.00019836, "epoch": 0.4873444357602357, "flos": 24026142349440.0, "grad_norm": 1.4914326274598346, "language_loss": 0.67723584, "learning_rate": 2.1789239472017494e-06, "loss": 0.69649893, "num_input_tokens_seen": 87246280, "step": 4053, "time_per_iteration": 2.580043315887451 }, { "auxiliary_loss_clip": 0.01133971, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.0473249, "balance_loss_mlp": 1.02091217, "epoch": 0.4874646786508748, "flos": 22820441500800.0, "grad_norm": 3.161445088248908, "language_loss": 0.73164916, "learning_rate": 2.1781480854932326e-06, "loss": 0.7532807, "num_input_tokens_seen": 87266045, "step": 4054, "time_per_iteration": 2.5829977989196777 }, { "auxiliary_loss_clip": 0.011225, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.04971468, "balance_loss_mlp": 1.02728975, "epoch": 0.48758492154151384, "flos": 21287594557440.0, "grad_norm": 3.171770037107228, "language_loss": 0.79121721, "learning_rate": 2.1773721967597933e-06, "loss": 0.8127898, "num_input_tokens_seen": 87284495, "step": 4055, "time_per_iteration": 2.612440824508667 }, { "auxiliary_loss_clip": 0.01046018, "auxiliary_loss_mlp": 0.01005522, "balance_loss_clip": 1.01710224, "balance_loss_mlp": 1.00432444, "epoch": 0.48770516443215295, "flos": 62244109180800.0, "grad_norm": 0.8445793526804394, "language_loss": 0.57393217, "learning_rate": 2.1765962811191322e-06, "loss": 0.59444761, "num_input_tokens_seen": 87338960, "step": 4056, "time_per_iteration": 3.1014516353607178 }, { "auxiliary_loss_clip": 0.01027347, "auxiliary_loss_mlp": 0.01006564, "balance_loss_clip": 1.02022934, "balance_loss_mlp": 1.00532424, "epoch": 0.48782540732279206, "flos": 66133451882880.0, "grad_norm": 0.824701256285342, "language_loss": 0.62040156, "learning_rate": 2.1758203386889566e-06, "loss": 0.64074063, "num_input_tokens_seen": 87401730, "step": 4057, "time_per_iteration": 3.2202255725860596 }, { "auxiliary_loss_clip": 0.01136105, "auxiliary_loss_mlp": 0.00761818, "balance_loss_clip": 1.04919529, "balance_loss_mlp": 1.00019717, "epoch": 0.4879456502134311, "flos": 14607922608000.0, "grad_norm": 3.1523159821456974, "language_loss": 0.84610164, "learning_rate": 2.1750443695869746e-06, "loss": 0.86508095, "num_input_tokens_seen": 87417300, "step": 4058, "time_per_iteration": 2.578232765197754 }, { "auxiliary_loss_clip": 0.01164965, "auxiliary_loss_mlp": 0.01026022, "balance_loss_clip": 1.04996061, "balance_loss_mlp": 1.01817846, "epoch": 0.4880658931040702, "flos": 19500464257920.0, "grad_norm": 1.711206553629646, "language_loss": 0.86037219, "learning_rate": 2.174268373930901e-06, "loss": 0.88228208, "num_input_tokens_seen": 87434815, "step": 4059, "time_per_iteration": 2.5200185775756836 }, { "auxiliary_loss_clip": 0.01131583, "auxiliary_loss_mlp": 0.00761604, "balance_loss_clip": 1.05027783, "balance_loss_mlp": 1.00018251, "epoch": 0.48818613599470934, "flos": 16723060928640.0, "grad_norm": 2.5952456997432423, "language_loss": 0.80133331, "learning_rate": 2.1734923518384537e-06, "loss": 0.82026529, "num_input_tokens_seen": 87451420, "step": 4060, "time_per_iteration": 2.5693461894989014 }, { "auxiliary_loss_clip": 0.01122102, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 1.0484271, "balance_loss_mlp": 1.02061093, "epoch": 0.4883063788853484, "flos": 26756932803840.0, "grad_norm": 1.6926019891895128, "language_loss": 0.82130969, "learning_rate": 2.1727163034273547e-06, "loss": 0.84281313, "num_input_tokens_seen": 87469585, "step": 4061, "time_per_iteration": 2.647096633911133 }, { "auxiliary_loss_clip": 0.01163208, "auxiliary_loss_mlp": 0.01030062, "balance_loss_clip": 1.04881108, "balance_loss_mlp": 1.02252197, "epoch": 0.4884266217759875, "flos": 16763388923520.0, "grad_norm": 2.0396234510720466, "language_loss": 0.78861111, "learning_rate": 2.17194022881533e-06, "loss": 0.81054378, "num_input_tokens_seen": 87485675, "step": 4062, "time_per_iteration": 2.48291015625 }, { "auxiliary_loss_clip": 0.0115475, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 1.05089664, "balance_loss_mlp": 1.02176666, "epoch": 0.4885468646666266, "flos": 24207132003840.0, "grad_norm": 1.6680653289736693, "language_loss": 0.67816854, "learning_rate": 2.1711641281201092e-06, "loss": 0.70002091, "num_input_tokens_seen": 87505605, "step": 4063, "time_per_iteration": 3.4254682064056396 }, { "auxiliary_loss_clip": 0.0116629, "auxiliary_loss_mlp": 0.01027764, "balance_loss_clip": 1.05490041, "balance_loss_mlp": 1.0202719, "epoch": 0.48866710755726567, "flos": 14610795696000.0, "grad_norm": 2.046741381279861, "language_loss": 0.79160511, "learning_rate": 2.1703880014594264e-06, "loss": 0.8135457, "num_input_tokens_seen": 87523195, "step": 4064, "time_per_iteration": 2.5121071338653564 }, { "auxiliary_loss_clip": 0.01113182, "auxiliary_loss_mlp": 0.01029896, "balance_loss_clip": 1.04750955, "balance_loss_mlp": 1.02278185, "epoch": 0.4887873504479048, "flos": 28804451771520.0, "grad_norm": 1.6029144081947768, "language_loss": 0.73953438, "learning_rate": 2.1696118489510182e-06, "loss": 0.76096517, "num_input_tokens_seen": 87544125, "step": 4065, "time_per_iteration": 2.6652181148529053 }, { "auxiliary_loss_clip": 0.01142949, "auxiliary_loss_mlp": 0.00761637, "balance_loss_clip": 1.04977119, "balance_loss_mlp": 1.00019574, "epoch": 0.48890759333854383, "flos": 22784387224320.0, "grad_norm": 2.386706233509167, "language_loss": 0.72898239, "learning_rate": 2.1688356707126286e-06, "loss": 0.74802822, "num_input_tokens_seen": 87563745, "step": 4066, "time_per_iteration": 2.5987792015075684 }, { "auxiliary_loss_clip": 0.01133116, "auxiliary_loss_mlp": 0.01031154, "balance_loss_clip": 1.05069959, "balance_loss_mlp": 1.02292848, "epoch": 0.48902783622918294, "flos": 17786088956160.0, "grad_norm": 1.9193284996521307, "language_loss": 0.70219517, "learning_rate": 2.168059466862001e-06, "loss": 0.72383785, "num_input_tokens_seen": 87581895, "step": 4067, "time_per_iteration": 3.328705072402954 }, { "auxiliary_loss_clip": 0.01150955, "auxiliary_loss_mlp": 0.01029577, "balance_loss_clip": 1.04695225, "balance_loss_mlp": 1.02201295, "epoch": 0.48914807911982205, "flos": 22310294590080.0, "grad_norm": 3.5982188518597464, "language_loss": 0.8180114, "learning_rate": 2.167283237516887e-06, "loss": 0.83981681, "num_input_tokens_seen": 87600170, "step": 4068, "time_per_iteration": 2.5494580268859863 }, { "auxiliary_loss_clip": 0.01154082, "auxiliary_loss_mlp": 0.01028456, "balance_loss_clip": 1.0510788, "balance_loss_mlp": 1.02071309, "epoch": 0.4892683220104611, "flos": 16363020954240.0, "grad_norm": 1.7084735013652907, "language_loss": 0.74400461, "learning_rate": 2.1665069827950383e-06, "loss": 0.76583004, "num_input_tokens_seen": 87617455, "step": 4069, "time_per_iteration": 3.2794036865234375 }, { "auxiliary_loss_clip": 0.01146666, "auxiliary_loss_mlp": 0.01031064, "balance_loss_clip": 1.04964399, "balance_loss_mlp": 1.02381277, "epoch": 0.4893885649011002, "flos": 15739144606080.0, "grad_norm": 1.6451703315750443, "language_loss": 0.86808717, "learning_rate": 2.1657307028142126e-06, "loss": 0.88986444, "num_input_tokens_seen": 87634995, "step": 4070, "time_per_iteration": 2.5219714641571045 }, { "auxiliary_loss_clip": 0.01151952, "auxiliary_loss_mlp": 0.01033389, "balance_loss_clip": 1.05118108, "balance_loss_mlp": 1.02517605, "epoch": 0.48950880779173933, "flos": 28581984887040.0, "grad_norm": 2.0168145259010464, "language_loss": 0.66825962, "learning_rate": 2.164954397692171e-06, "loss": 0.69011307, "num_input_tokens_seen": 87654420, "step": 4071, "time_per_iteration": 3.439969301223755 }, { "auxiliary_loss_clip": 0.0105634, "auxiliary_loss_mlp": 0.01002605, "balance_loss_clip": 1.02097774, "balance_loss_mlp": 1.00130582, "epoch": 0.4896290506823784, "flos": 66186310746240.0, "grad_norm": 1.1364450998683144, "language_loss": 0.77349752, "learning_rate": 2.164178067546678e-06, "loss": 0.79408693, "num_input_tokens_seen": 87713585, "step": 4072, "time_per_iteration": 3.199843168258667 }, { "auxiliary_loss_clip": 0.0115469, "auxiliary_loss_mlp": 0.01024791, "balance_loss_clip": 1.04896259, "balance_loss_mlp": 1.01704502, "epoch": 0.4897492935730175, "flos": 12531065207040.0, "grad_norm": 1.739137066606048, "language_loss": 0.90505004, "learning_rate": 2.163401712495504e-06, "loss": 0.92684484, "num_input_tokens_seen": 87731280, "step": 4073, "time_per_iteration": 2.5418434143066406 }, { "auxiliary_loss_clip": 0.01119971, "auxiliary_loss_mlp": 0.010329, "balance_loss_clip": 1.04709065, "balance_loss_mlp": 1.02531266, "epoch": 0.4898695364636566, "flos": 23476816679040.0, "grad_norm": 2.1651824183629316, "language_loss": 0.79402113, "learning_rate": 2.1626253326564194e-06, "loss": 0.81554985, "num_input_tokens_seen": 87750230, "step": 4074, "time_per_iteration": 2.621030569076538 }, { "auxiliary_loss_clip": 0.01147775, "auxiliary_loss_mlp": 0.01031786, "balance_loss_clip": 1.04741073, "balance_loss_mlp": 1.02379942, "epoch": 0.48998977935429566, "flos": 27160209774720.0, "grad_norm": 1.6489024674091168, "language_loss": 0.76898026, "learning_rate": 2.161848928147201e-06, "loss": 0.7907759, "num_input_tokens_seen": 87770500, "step": 4075, "time_per_iteration": 2.6019067764282227 }, { "auxiliary_loss_clip": 0.01162701, "auxiliary_loss_mlp": 0.01027237, "balance_loss_clip": 1.05317283, "balance_loss_mlp": 1.01938105, "epoch": 0.4901100222449348, "flos": 20339588856960.0, "grad_norm": 2.089728574454659, "language_loss": 0.80730718, "learning_rate": 2.161072499085629e-06, "loss": 0.82920659, "num_input_tokens_seen": 87789495, "step": 4076, "time_per_iteration": 2.5156238079071045 }, { "auxiliary_loss_clip": 0.01140122, "auxiliary_loss_mlp": 0.01031719, "balance_loss_clip": 1.0511198, "balance_loss_mlp": 1.02402377, "epoch": 0.4902302651355739, "flos": 30446359384320.0, "grad_norm": 1.5373179364288327, "language_loss": 0.82935333, "learning_rate": 2.160296045589487e-06, "loss": 0.85107172, "num_input_tokens_seen": 87812955, "step": 4077, "time_per_iteration": 2.663034677505493 }, { "auxiliary_loss_clip": 0.01161073, "auxiliary_loss_mlp": 0.01027839, "balance_loss_clip": 1.0515933, "balance_loss_mlp": 1.01981616, "epoch": 0.49035050802621294, "flos": 19174180089600.0, "grad_norm": 1.7379484858740346, "language_loss": 0.69796538, "learning_rate": 2.159519567776562e-06, "loss": 0.71985447, "num_input_tokens_seen": 87832605, "step": 4078, "time_per_iteration": 2.5077149868011475 }, { "auxiliary_loss_clip": 0.01122129, "auxiliary_loss_mlp": 0.01031386, "balance_loss_clip": 1.04398012, "balance_loss_mlp": 1.02318454, "epoch": 0.49047075091685205, "flos": 22228489365120.0, "grad_norm": 2.4493201324949943, "language_loss": 0.71230358, "learning_rate": 2.1587430657646463e-06, "loss": 0.73383868, "num_input_tokens_seen": 87846040, "step": 4079, "time_per_iteration": 2.6034135818481445 }, { "auxiliary_loss_clip": 0.01150514, "auxiliary_loss_mlp": 0.01032826, "balance_loss_clip": 1.05409706, "balance_loss_mlp": 1.02527952, "epoch": 0.4905909938074911, "flos": 20156516213760.0, "grad_norm": 1.790849935282541, "language_loss": 0.78108692, "learning_rate": 2.157966539671533e-06, "loss": 0.80292034, "num_input_tokens_seen": 87865680, "step": 4080, "time_per_iteration": 2.5437145233154297 }, { "auxiliary_loss_clip": 0.01139825, "auxiliary_loss_mlp": 0.01029113, "balance_loss_clip": 1.05032432, "balance_loss_mlp": 1.022053, "epoch": 0.4907112366981302, "flos": 17202217380480.0, "grad_norm": 1.8825246193916172, "language_loss": 0.67319083, "learning_rate": 2.157189989615021e-06, "loss": 0.69488025, "num_input_tokens_seen": 87884270, "step": 4081, "time_per_iteration": 2.5717525482177734 }, { "auxiliary_loss_clip": 0.01163903, "auxiliary_loss_mlp": 0.00762225, "balance_loss_clip": 1.05011427, "balance_loss_mlp": 1.00015759, "epoch": 0.4908314795887693, "flos": 21688968107520.0, "grad_norm": 2.9090867705678547, "language_loss": 0.75403357, "learning_rate": 2.156413415712913e-06, "loss": 0.77329481, "num_input_tokens_seen": 87906320, "step": 4082, "time_per_iteration": 2.5772855281829834 }, { "auxiliary_loss_clip": 0.01153938, "auxiliary_loss_mlp": 0.00761434, "balance_loss_clip": 1.05221987, "balance_loss_mlp": 1.00016856, "epoch": 0.4909517224794084, "flos": 26213676531840.0, "grad_norm": 1.6183906080018549, "language_loss": 0.78697866, "learning_rate": 2.155636818083014e-06, "loss": 0.80613238, "num_input_tokens_seen": 87927690, "step": 4083, "time_per_iteration": 2.588271379470825 }, { "auxiliary_loss_clip": 0.01146856, "auxiliary_loss_mlp": 0.01032617, "balance_loss_clip": 1.05075681, "balance_loss_mlp": 1.02532744, "epoch": 0.4910719653700475, "flos": 23148377694720.0, "grad_norm": 1.8048707165874864, "language_loss": 0.8396436, "learning_rate": 2.154860196843134e-06, "loss": 0.86143839, "num_input_tokens_seen": 87946885, "step": 4084, "time_per_iteration": 2.597961664199829 }, { "auxiliary_loss_clip": 0.01177569, "auxiliary_loss_mlp": 0.01028547, "balance_loss_clip": 1.05234957, "balance_loss_mlp": 1.0207386, "epoch": 0.4911922082606866, "flos": 23331845387520.0, "grad_norm": 2.056942888142575, "language_loss": 0.76894057, "learning_rate": 2.154083552111085e-06, "loss": 0.7910018, "num_input_tokens_seen": 87966055, "step": 4085, "time_per_iteration": 2.51446795463562 }, { "auxiliary_loss_clip": 0.01178908, "auxiliary_loss_mlp": 0.01027775, "balance_loss_clip": 1.05178881, "balance_loss_mlp": 1.01985383, "epoch": 0.49131245115132566, "flos": 29203239542400.0, "grad_norm": 1.6273680593012725, "language_loss": 0.81904781, "learning_rate": 2.1533068840046834e-06, "loss": 0.84111464, "num_input_tokens_seen": 87986320, "step": 4086, "time_per_iteration": 2.578711748123169 }, { "auxiliary_loss_clip": 0.01145908, "auxiliary_loss_mlp": 0.0076171, "balance_loss_clip": 1.05105031, "balance_loss_mlp": 1.00017917, "epoch": 0.49143269404196477, "flos": 20147465986560.0, "grad_norm": 3.237740827907876, "language_loss": 0.61697578, "learning_rate": 2.152530192641749e-06, "loss": 0.63605201, "num_input_tokens_seen": 88001230, "step": 4087, "time_per_iteration": 2.5416808128356934 }, { "auxiliary_loss_clip": 0.01165883, "auxiliary_loss_mlp": 0.01026605, "balance_loss_clip": 1.05228579, "balance_loss_mlp": 1.01923156, "epoch": 0.4915529369326039, "flos": 24389809597440.0, "grad_norm": 1.8998714739131946, "language_loss": 0.7273941, "learning_rate": 2.1517534781401068e-06, "loss": 0.74931896, "num_input_tokens_seen": 88019110, "step": 4088, "time_per_iteration": 2.5542490482330322 }, { "auxiliary_loss_clip": 0.01163153, "auxiliary_loss_mlp": 0.01028, "balance_loss_clip": 1.05113935, "balance_loss_mlp": 1.0206027, "epoch": 0.49167317982324293, "flos": 10524305197440.0, "grad_norm": 2.151768297455284, "language_loss": 0.69418269, "learning_rate": 2.150976740617581e-06, "loss": 0.71609426, "num_input_tokens_seen": 88035670, "step": 4089, "time_per_iteration": 3.2695202827453613 }, { "auxiliary_loss_clip": 0.01156421, "auxiliary_loss_mlp": 0.01032266, "balance_loss_clip": 1.0532012, "balance_loss_mlp": 1.02450514, "epoch": 0.49179342271388204, "flos": 25593427457280.0, "grad_norm": 1.9997206100011011, "language_loss": 0.71126676, "learning_rate": 2.150199980192006e-06, "loss": 0.73315358, "num_input_tokens_seen": 88054790, "step": 4090, "time_per_iteration": 2.5988078117370605 }, { "auxiliary_loss_clip": 0.01142877, "auxiliary_loss_mlp": 0.01026548, "balance_loss_clip": 1.04854751, "balance_loss_mlp": 1.01868653, "epoch": 0.49191366560452116, "flos": 21102043875840.0, "grad_norm": 1.5863816777045285, "language_loss": 0.80836058, "learning_rate": 2.1494231969812114e-06, "loss": 0.83005488, "num_input_tokens_seen": 88073780, "step": 4091, "time_per_iteration": 2.5230631828308105 }, { "auxiliary_loss_clip": 0.01140259, "auxiliary_loss_mlp": 0.01032069, "balance_loss_clip": 1.05032337, "balance_loss_mlp": 1.02472615, "epoch": 0.4920339084951602, "flos": 26067520091520.0, "grad_norm": 1.982135663228463, "language_loss": 0.80597854, "learning_rate": 2.1486463911030372e-06, "loss": 0.82770187, "num_input_tokens_seen": 88094430, "step": 4092, "time_per_iteration": 2.6100234985351562 }, { "auxiliary_loss_clip": 0.01147312, "auxiliary_loss_mlp": 0.01026658, "balance_loss_clip": 1.04796052, "balance_loss_mlp": 1.01917171, "epoch": 0.4921541513857993, "flos": 25081269384960.0, "grad_norm": 1.7634554366553603, "language_loss": 0.74462116, "learning_rate": 2.147869562675324e-06, "loss": 0.76636088, "num_input_tokens_seen": 88113400, "step": 4093, "time_per_iteration": 3.3397929668426514 }, { "auxiliary_loss_clip": 0.01166813, "auxiliary_loss_mlp": 0.0103683, "balance_loss_clip": 1.05411267, "balance_loss_mlp": 1.02794313, "epoch": 0.49227439427643843, "flos": 24389809597440.0, "grad_norm": 1.6509237990113101, "language_loss": 0.72252804, "learning_rate": 2.147092711815915e-06, "loss": 0.74456453, "num_input_tokens_seen": 88132750, "step": 4094, "time_per_iteration": 2.569129467010498 }, { "auxiliary_loss_clip": 0.01135893, "auxiliary_loss_mlp": 0.01031058, "balance_loss_clip": 1.05151331, "balance_loss_mlp": 1.02325535, "epoch": 0.4923946371670775, "flos": 11363753018880.0, "grad_norm": 2.78757949260212, "language_loss": 0.8650744, "learning_rate": 2.1463158386426593e-06, "loss": 0.8867439, "num_input_tokens_seen": 88150560, "step": 4095, "time_per_iteration": 3.2723093032836914 }, { "auxiliary_loss_clip": 0.01157327, "auxiliary_loss_mlp": 0.01027577, "balance_loss_clip": 1.05343342, "balance_loss_mlp": 1.01941156, "epoch": 0.4925148800577166, "flos": 30445964334720.0, "grad_norm": 2.6044202747462513, "language_loss": 0.77162236, "learning_rate": 2.145538943273407e-06, "loss": 0.7934714, "num_input_tokens_seen": 88170835, "step": 4096, "time_per_iteration": 3.4488463401794434 }, { "auxiliary_loss_clip": 0.01179348, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.05343246, "balance_loss_mlp": 1.02437329, "epoch": 0.49263512294835565, "flos": 20850454039680.0, "grad_norm": 2.177459688660364, "language_loss": 0.71779943, "learning_rate": 2.144762025826013e-06, "loss": 0.7399168, "num_input_tokens_seen": 88189925, "step": 4097, "time_per_iteration": 2.49027943611145 }, { "auxiliary_loss_clip": 0.01168669, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.05326521, "balance_loss_mlp": 1.02355874, "epoch": 0.49275536583899476, "flos": 23767477534080.0, "grad_norm": 2.918457134827686, "language_loss": 0.87204623, "learning_rate": 2.143985086418334e-06, "loss": 0.89405107, "num_input_tokens_seen": 88205105, "step": 4098, "time_per_iteration": 2.5130274295806885 }, { "auxiliary_loss_clip": 0.01152885, "auxiliary_loss_mlp": 0.01025323, "balance_loss_clip": 1.05128205, "balance_loss_mlp": 1.01814044, "epoch": 0.4928756087296339, "flos": 22273522041600.0, "grad_norm": 1.3349778110086583, "language_loss": 0.7657333, "learning_rate": 2.1432081251682324e-06, "loss": 0.7875154, "num_input_tokens_seen": 88225475, "step": 4099, "time_per_iteration": 2.5365042686462402 }, { "auxiliary_loss_clip": 0.01164189, "auxiliary_loss_mlp": 0.01027312, "balance_loss_clip": 1.05451417, "balance_loss_mlp": 1.01922369, "epoch": 0.49299585162027293, "flos": 19645471463040.0, "grad_norm": 1.7018370934955276, "language_loss": 0.86880267, "learning_rate": 2.142431142193572e-06, "loss": 0.89071763, "num_input_tokens_seen": 88243255, "step": 4100, "time_per_iteration": 2.497908353805542 }, { "auxiliary_loss_clip": 0.0117803, "auxiliary_loss_mlp": 0.0103358, "balance_loss_clip": 1.05447102, "balance_loss_mlp": 1.02643013, "epoch": 0.49311609451091204, "flos": 38837138497920.0, "grad_norm": 2.206713364879448, "language_loss": 0.71828365, "learning_rate": 2.1416541376122207e-06, "loss": 0.74039972, "num_input_tokens_seen": 88263435, "step": 4101, "time_per_iteration": 2.6191036701202393 }, { "auxiliary_loss_clip": 0.011789, "auxiliary_loss_mlp": 0.01032025, "balance_loss_clip": 1.05333352, "balance_loss_mlp": 1.02393055, "epoch": 0.49323633740155115, "flos": 28329102161280.0, "grad_norm": 1.6721328351008704, "language_loss": 0.72892427, "learning_rate": 2.1408771115420496e-06, "loss": 0.75103354, "num_input_tokens_seen": 88283295, "step": 4102, "time_per_iteration": 2.5364179611206055 }, { "auxiliary_loss_clip": 0.01128346, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.05378699, "balance_loss_mlp": 1.02552164, "epoch": 0.4933565802921902, "flos": 21135584200320.0, "grad_norm": 1.9374610797091183, "language_loss": 0.6483233, "learning_rate": 2.140100064100932e-06, "loss": 0.66993707, "num_input_tokens_seen": 88299270, "step": 4103, "time_per_iteration": 2.589509963989258 }, { "auxiliary_loss_clip": 0.01162108, "auxiliary_loss_mlp": 0.01029301, "balance_loss_clip": 1.0524627, "balance_loss_mlp": 1.0214808, "epoch": 0.4934768231828293, "flos": 18039007595520.0, "grad_norm": 2.0203283255523456, "language_loss": 0.75762922, "learning_rate": 2.139322995406746e-06, "loss": 0.77954328, "num_input_tokens_seen": 88316905, "step": 4104, "time_per_iteration": 2.462756633758545 }, { "auxiliary_loss_clip": 0.01182066, "auxiliary_loss_mlp": 0.01037552, "balance_loss_clip": 1.05649364, "balance_loss_mlp": 1.02942824, "epoch": 0.4935970660734684, "flos": 23469957181440.0, "grad_norm": 2.424513943100918, "language_loss": 0.79401016, "learning_rate": 2.1385459055773727e-06, "loss": 0.81620646, "num_input_tokens_seen": 88335095, "step": 4105, "time_per_iteration": 2.492323398590088 }, { "auxiliary_loss_clip": 0.01109823, "auxiliary_loss_mlp": 0.00760932, "balance_loss_clip": 1.04347062, "balance_loss_mlp": 1.00018728, "epoch": 0.4937173089641075, "flos": 64479258840960.0, "grad_norm": 2.075186886873627, "language_loss": 0.7409814, "learning_rate": 2.137768794730696e-06, "loss": 0.75968897, "num_input_tokens_seen": 88358545, "step": 4106, "time_per_iteration": 2.998983383178711 }, { "auxiliary_loss_clip": 0.01157358, "auxiliary_loss_mlp": 0.01035041, "balance_loss_clip": 1.05429769, "balance_loss_mlp": 1.02685118, "epoch": 0.4938375518547466, "flos": 22346025644160.0, "grad_norm": 1.748475360599871, "language_loss": 0.80381751, "learning_rate": 2.1369916629846026e-06, "loss": 0.82574147, "num_input_tokens_seen": 88378295, "step": 4107, "time_per_iteration": 2.5438485145568848 }, { "auxiliary_loss_clip": 0.01151975, "auxiliary_loss_mlp": 0.01027265, "balance_loss_clip": 1.05047667, "balance_loss_mlp": 1.01983571, "epoch": 0.4939577947453857, "flos": 17858700299520.0, "grad_norm": 3.0102353892912883, "language_loss": 0.74903703, "learning_rate": 2.136214510456983e-06, "loss": 0.77082944, "num_input_tokens_seen": 88396750, "step": 4108, "time_per_iteration": 2.499143123626709 }, { "auxiliary_loss_clip": 0.01035946, "auxiliary_loss_mlp": 0.00751502, "balance_loss_clip": 1.01938808, "balance_loss_mlp": 1.00006902, "epoch": 0.49407803763602476, "flos": 70066746875520.0, "grad_norm": 0.902609334423674, "language_loss": 0.63166678, "learning_rate": 2.1354373372657296e-06, "loss": 0.64954126, "num_input_tokens_seen": 88455190, "step": 4109, "time_per_iteration": 3.204637050628662 }, { "auxiliary_loss_clip": 0.01178091, "auxiliary_loss_mlp": 0.0103436, "balance_loss_clip": 1.05567837, "balance_loss_mlp": 1.02618814, "epoch": 0.49419828052666387, "flos": 24317485562880.0, "grad_norm": 1.4524144752389598, "language_loss": 0.71105587, "learning_rate": 2.1346601435287404e-06, "loss": 0.73318046, "num_input_tokens_seen": 88477460, "step": 4110, "time_per_iteration": 2.4971580505371094 }, { "auxiliary_loss_clip": 0.01147599, "auxiliary_loss_mlp": 0.01037003, "balance_loss_clip": 1.04760861, "balance_loss_mlp": 1.02963555, "epoch": 0.494318523417303, "flos": 29386060790400.0, "grad_norm": 1.7133734042410407, "language_loss": 0.80106771, "learning_rate": 2.1338829293639144e-06, "loss": 0.82291371, "num_input_tokens_seen": 88497820, "step": 4111, "time_per_iteration": 2.599439859390259 }, { "auxiliary_loss_clip": 0.01118812, "auxiliary_loss_mlp": 0.0103087, "balance_loss_clip": 1.04597521, "balance_loss_mlp": 1.02329993, "epoch": 0.49443876630794203, "flos": 15268284195840.0, "grad_norm": 2.5259661442667674, "language_loss": 0.83020836, "learning_rate": 2.1331056948891547e-06, "loss": 0.85170519, "num_input_tokens_seen": 88514920, "step": 4112, "time_per_iteration": 2.5853421688079834 }, { "auxiliary_loss_clip": 0.01147277, "auxiliary_loss_mlp": 0.01025498, "balance_loss_clip": 1.05090284, "balance_loss_mlp": 1.01732659, "epoch": 0.49455900919858115, "flos": 12347453859840.0, "grad_norm": 2.1411361041668733, "language_loss": 0.7600106, "learning_rate": 2.1323284402223666e-06, "loss": 0.7817384, "num_input_tokens_seen": 88530910, "step": 4113, "time_per_iteration": 2.508910655975342 }, { "auxiliary_loss_clip": 0.0117679, "auxiliary_loss_mlp": 0.00760224, "balance_loss_clip": 1.05670714, "balance_loss_mlp": 1.00017619, "epoch": 0.4946792520892202, "flos": 22779610715520.0, "grad_norm": 1.7994431446400758, "language_loss": 0.88147527, "learning_rate": 2.1315511654814597e-06, "loss": 0.90084541, "num_input_tokens_seen": 88549320, "step": 4114, "time_per_iteration": 2.508526563644409 }, { "auxiliary_loss_clip": 0.01145899, "auxiliary_loss_mlp": 0.01032088, "balance_loss_clip": 1.05214357, "balance_loss_mlp": 1.02542126, "epoch": 0.4947994949798593, "flos": 23148126299520.0, "grad_norm": 1.7532034367466658, "language_loss": 0.78349084, "learning_rate": 2.1307738707843456e-06, "loss": 0.80527073, "num_input_tokens_seen": 88568985, "step": 4115, "time_per_iteration": 3.3329477310180664 }, { "auxiliary_loss_clip": 0.01171405, "auxiliary_loss_mlp": 0.0103895, "balance_loss_clip": 1.05506158, "balance_loss_mlp": 1.03023577, "epoch": 0.4949197378704984, "flos": 23659997063040.0, "grad_norm": 1.945609180396013, "language_loss": 0.6934334, "learning_rate": 2.1299965562489385e-06, "loss": 0.71553689, "num_input_tokens_seen": 88588790, "step": 4116, "time_per_iteration": 2.540602922439575 }, { "auxiliary_loss_clip": 0.01161585, "auxiliary_loss_mlp": 0.01032441, "balance_loss_clip": 1.05047154, "balance_loss_mlp": 1.02516675, "epoch": 0.4950399807611375, "flos": 26911493026560.0, "grad_norm": 1.3258579220085758, "language_loss": 0.78690434, "learning_rate": 2.129219221993158e-06, "loss": 0.80884463, "num_input_tokens_seen": 88613575, "step": 4117, "time_per_iteration": 2.5905632972717285 }, { "auxiliary_loss_clip": 0.01040612, "auxiliary_loss_mlp": 0.01004339, "balance_loss_clip": 1.01929367, "balance_loss_mlp": 1.00320625, "epoch": 0.4951602236517766, "flos": 67315270187520.0, "grad_norm": 0.7896809960680853, "language_loss": 0.59993327, "learning_rate": 2.128441868134924e-06, "loss": 0.62038279, "num_input_tokens_seen": 88675510, "step": 4118, "time_per_iteration": 3.188976764678955 }, { "auxiliary_loss_clip": 0.01137411, "auxiliary_loss_mlp": 0.01031323, "balance_loss_clip": 1.04847789, "balance_loss_mlp": 1.0239048, "epoch": 0.4952804665424157, "flos": 19901442758400.0, "grad_norm": 2.252894202480421, "language_loss": 0.82677627, "learning_rate": 2.1276644947921606e-06, "loss": 0.84846354, "num_input_tokens_seen": 88694425, "step": 4119, "time_per_iteration": 3.3306000232696533 }, { "auxiliary_loss_clip": 0.01161603, "auxiliary_loss_mlp": 0.01029105, "balance_loss_clip": 1.05065155, "balance_loss_mlp": 1.02076101, "epoch": 0.49540070943305475, "flos": 18806813740800.0, "grad_norm": 1.7770651860566589, "language_loss": 0.82257318, "learning_rate": 2.126887102082795e-06, "loss": 0.84448028, "num_input_tokens_seen": 88714450, "step": 4120, "time_per_iteration": 2.487952947616577 }, { "auxiliary_loss_clip": 0.01135261, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.04627776, "balance_loss_mlp": 1.02407742, "epoch": 0.49552095232369386, "flos": 24934179191040.0, "grad_norm": 1.729039384768213, "language_loss": 0.7031076, "learning_rate": 2.126109690124757e-06, "loss": 0.72477335, "num_input_tokens_seen": 88735265, "step": 4121, "time_per_iteration": 3.3127381801605225 }, { "auxiliary_loss_clip": 0.01125476, "auxiliary_loss_mlp": 0.01026951, "balance_loss_clip": 1.04902315, "balance_loss_mlp": 1.01947045, "epoch": 0.495641195214333, "flos": 22857249962880.0, "grad_norm": 1.5732529297067925, "language_loss": 0.71537805, "learning_rate": 2.1253322590359786e-06, "loss": 0.7369023, "num_input_tokens_seen": 88754600, "step": 4122, "time_per_iteration": 2.592883348464966 }, { "auxiliary_loss_clip": 0.01163046, "auxiliary_loss_mlp": 0.01034176, "balance_loss_clip": 1.05138791, "balance_loss_mlp": 1.02658844, "epoch": 0.49576143810497203, "flos": 25769748343680.0, "grad_norm": 1.575374183826372, "language_loss": 0.74081182, "learning_rate": 2.124554808934397e-06, "loss": 0.762784, "num_input_tokens_seen": 88775180, "step": 4123, "time_per_iteration": 3.348723888397217 }, { "auxiliary_loss_clip": 0.01113796, "auxiliary_loss_mlp": 0.01031677, "balance_loss_clip": 1.04220986, "balance_loss_mlp": 1.02414298, "epoch": 0.49588168099561114, "flos": 22128838058880.0, "grad_norm": 1.7083068582222076, "language_loss": 0.72967529, "learning_rate": 2.1237773399379496e-06, "loss": 0.75112998, "num_input_tokens_seen": 88796145, "step": 4124, "time_per_iteration": 2.6150074005126953 }, { "auxiliary_loss_clip": 0.01152756, "auxiliary_loss_mlp": 0.01028975, "balance_loss_clip": 1.04819787, "balance_loss_mlp": 1.02094579, "epoch": 0.49600192388625025, "flos": 24387331559040.0, "grad_norm": 1.657994744657819, "language_loss": 0.87036741, "learning_rate": 2.122999852164578e-06, "loss": 0.89218473, "num_input_tokens_seen": 88816765, "step": 4125, "time_per_iteration": 2.556964635848999 }, { "auxiliary_loss_clip": 0.01120369, "auxiliary_loss_mlp": 0.01028896, "balance_loss_clip": 1.04913497, "balance_loss_mlp": 1.02105212, "epoch": 0.4961221667768893, "flos": 22857429530880.0, "grad_norm": 2.262996378218739, "language_loss": 0.58849156, "learning_rate": 2.122222345732227e-06, "loss": 0.60998416, "num_input_tokens_seen": 88836680, "step": 4126, "time_per_iteration": 2.6030972003936768 }, { "auxiliary_loss_clip": 0.01136732, "auxiliary_loss_mlp": 0.01030732, "balance_loss_clip": 1.04809618, "balance_loss_mlp": 1.0228492, "epoch": 0.4962424096675284, "flos": 17858089768320.0, "grad_norm": 1.6555735875777922, "language_loss": 0.82764208, "learning_rate": 2.121444820758843e-06, "loss": 0.84931672, "num_input_tokens_seen": 88855320, "step": 4127, "time_per_iteration": 2.554067373275757 }, { "auxiliary_loss_clip": 0.01119911, "auxiliary_loss_mlp": 0.01032993, "balance_loss_clip": 1.04862988, "balance_loss_mlp": 1.02497625, "epoch": 0.49636265255816747, "flos": 21793611404160.0, "grad_norm": 1.967197102579569, "language_loss": 0.78690612, "learning_rate": 2.120667277362376e-06, "loss": 0.8084352, "num_input_tokens_seen": 88874035, "step": 4128, "time_per_iteration": 2.594515800476074 }, { "auxiliary_loss_clip": 0.0118236, "auxiliary_loss_mlp": 0.010268, "balance_loss_clip": 1.0568099, "balance_loss_mlp": 1.01864648, "epoch": 0.4964828954488066, "flos": 16358603581440.0, "grad_norm": 1.8957120822679856, "language_loss": 0.85178983, "learning_rate": 2.1198897156607796e-06, "loss": 0.8738814, "num_input_tokens_seen": 88891390, "step": 4129, "time_per_iteration": 2.4333302974700928 }, { "auxiliary_loss_clip": 0.01167511, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.05245018, "balance_loss_mlp": 1.02526188, "epoch": 0.4966031383394457, "flos": 24711101775360.0, "grad_norm": 2.4210931040895005, "language_loss": 0.73310387, "learning_rate": 2.1191121357720085e-06, "loss": 0.75510925, "num_input_tokens_seen": 88909450, "step": 4130, "time_per_iteration": 2.5157558917999268 }, { "auxiliary_loss_clip": 0.01115445, "auxiliary_loss_mlp": 0.0103212, "balance_loss_clip": 1.04705024, "balance_loss_mlp": 1.02454412, "epoch": 0.49672338123008475, "flos": 22930615491840.0, "grad_norm": 1.8234506545270768, "language_loss": 0.74377966, "learning_rate": 2.1183345378140206e-06, "loss": 0.76525533, "num_input_tokens_seen": 88929195, "step": 4131, "time_per_iteration": 2.6002607345581055 }, { "auxiliary_loss_clip": 0.01061406, "auxiliary_loss_mlp": 0.01010184, "balance_loss_clip": 1.01664901, "balance_loss_mlp": 1.00892639, "epoch": 0.49684362412072386, "flos": 65976736844160.0, "grad_norm": 0.8519891515345855, "language_loss": 0.61991847, "learning_rate": 2.1175569219047783e-06, "loss": 0.64063442, "num_input_tokens_seen": 88990635, "step": 4132, "time_per_iteration": 3.2120490074157715 }, { "auxiliary_loss_clip": 0.01177488, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.05357111, "balance_loss_mlp": 1.02585125, "epoch": 0.49696386701136297, "flos": 19971288754560.0, "grad_norm": 1.9133935785879337, "language_loss": 0.73331118, "learning_rate": 2.1167792881622437e-06, "loss": 0.75541914, "num_input_tokens_seen": 89009655, "step": 4133, "time_per_iteration": 2.479883909225464 }, { "auxiliary_loss_clip": 0.0114861, "auxiliary_loss_mlp": 0.01027374, "balance_loss_clip": 1.05444872, "balance_loss_mlp": 1.01962256, "epoch": 0.497084109902002, "flos": 24750819239040.0, "grad_norm": 1.7145511489382614, "language_loss": 0.81107306, "learning_rate": 2.116001636704384e-06, "loss": 0.83283293, "num_input_tokens_seen": 89030040, "step": 4134, "time_per_iteration": 2.554598331451416 }, { "auxiliary_loss_clip": 0.0113029, "auxiliary_loss_mlp": 0.01027209, "balance_loss_clip": 1.04912591, "balance_loss_mlp": 1.01917386, "epoch": 0.49720435279264114, "flos": 21871825269120.0, "grad_norm": 4.057534061135621, "language_loss": 0.80483896, "learning_rate": 2.1152239676491685e-06, "loss": 0.82641399, "num_input_tokens_seen": 89048145, "step": 4135, "time_per_iteration": 2.5975704193115234 }, { "auxiliary_loss_clip": 0.0115528, "auxiliary_loss_mlp": 0.01026097, "balance_loss_clip": 1.0500195, "balance_loss_mlp": 1.01847339, "epoch": 0.49732459568328025, "flos": 23805794367360.0, "grad_norm": 1.7468151034275665, "language_loss": 0.73347038, "learning_rate": 2.114446281114569e-06, "loss": 0.75528419, "num_input_tokens_seen": 89067165, "step": 4136, "time_per_iteration": 2.5571212768554688 }, { "auxiliary_loss_clip": 0.01143517, "auxiliary_loss_mlp": 0.01042098, "balance_loss_clip": 1.05055523, "balance_loss_mlp": 1.0345397, "epoch": 0.4974448385739193, "flos": 20047742853120.0, "grad_norm": 1.9585269528026878, "language_loss": 0.75991654, "learning_rate": 2.1136685772185587e-06, "loss": 0.78177273, "num_input_tokens_seen": 89086190, "step": 4137, "time_per_iteration": 2.5242931842803955 }, { "auxiliary_loss_clip": 0.01149596, "auxiliary_loss_mlp": 0.00761956, "balance_loss_clip": 1.04540873, "balance_loss_mlp": 1.00017703, "epoch": 0.4975650814645584, "flos": 24821347593600.0, "grad_norm": 1.5997846234397637, "language_loss": 0.77929437, "learning_rate": 2.1128908560791163e-06, "loss": 0.79840994, "num_input_tokens_seen": 89106020, "step": 4138, "time_per_iteration": 2.573556661605835 }, { "auxiliary_loss_clip": 0.01179039, "auxiliary_loss_mlp": 0.0103448, "balance_loss_clip": 1.05534947, "balance_loss_mlp": 1.02626097, "epoch": 0.4976853243551975, "flos": 19829477859840.0, "grad_norm": 1.8638726751558847, "language_loss": 0.78572941, "learning_rate": 2.1121131178142203e-06, "loss": 0.80786467, "num_input_tokens_seen": 89125385, "step": 4139, "time_per_iteration": 2.4593639373779297 }, { "auxiliary_loss_clip": 0.01148047, "auxiliary_loss_mlp": 0.0102675, "balance_loss_clip": 1.0474416, "balance_loss_mlp": 1.01906705, "epoch": 0.4978055672458366, "flos": 23142990654720.0, "grad_norm": 1.4542402657673086, "language_loss": 0.82282186, "learning_rate": 2.1113353625418544e-06, "loss": 0.84456986, "num_input_tokens_seen": 89143935, "step": 4140, "time_per_iteration": 2.5740013122558594 }, { "auxiliary_loss_clip": 0.01165494, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.05731058, "balance_loss_mlp": 1.02224171, "epoch": 0.4979258101364757, "flos": 15559914718080.0, "grad_norm": 1.5943686835394164, "language_loss": 0.78957248, "learning_rate": 2.1105575903800017e-06, "loss": 0.81152409, "num_input_tokens_seen": 89162655, "step": 4141, "time_per_iteration": 3.30733060836792 }, { "auxiliary_loss_clip": 0.01168467, "auxiliary_loss_mlp": 0.01039388, "balance_loss_clip": 1.0530616, "balance_loss_mlp": 1.03160715, "epoch": 0.4980460530271148, "flos": 26356169784960.0, "grad_norm": 1.749433591963683, "language_loss": 0.8504833, "learning_rate": 2.1097798014466502e-06, "loss": 0.87256181, "num_input_tokens_seen": 89182255, "step": 4142, "time_per_iteration": 2.5389459133148193 }, { "auxiliary_loss_clip": 0.01168126, "auxiliary_loss_mlp": 0.01034352, "balance_loss_clip": 1.05336189, "balance_loss_mlp": 1.02632356, "epoch": 0.49816629591775385, "flos": 17274541415040.0, "grad_norm": 2.4086696858656453, "language_loss": 0.59181774, "learning_rate": 2.109001995859791e-06, "loss": 0.61384249, "num_input_tokens_seen": 89201155, "step": 4143, "time_per_iteration": 2.478395700454712 }, { "auxiliary_loss_clip": 0.0104691, "auxiliary_loss_mlp": 0.01002331, "balance_loss_clip": 1.01585317, "balance_loss_mlp": 1.00116909, "epoch": 0.49828653880839296, "flos": 64930947344640.0, "grad_norm": 0.7933570114983841, "language_loss": 0.60061121, "learning_rate": 2.108224173737415e-06, "loss": 0.62110353, "num_input_tokens_seen": 89264455, "step": 4144, "time_per_iteration": 3.161641836166382 }, { "auxiliary_loss_clip": 0.01145748, "auxiliary_loss_mlp": 0.01031089, "balance_loss_clip": 1.04768622, "balance_loss_mlp": 1.02308702, "epoch": 0.498406781699032, "flos": 27484806003840.0, "grad_norm": 1.769731746884637, "language_loss": 0.76225603, "learning_rate": 2.1074463351975183e-06, "loss": 0.78402436, "num_input_tokens_seen": 89283340, "step": 4145, "time_per_iteration": 3.3626086711883545 }, { "auxiliary_loss_clip": 0.01139496, "auxiliary_loss_mlp": 0.01032986, "balance_loss_clip": 1.05050921, "balance_loss_mlp": 1.02559257, "epoch": 0.49852702458967113, "flos": 31499870307840.0, "grad_norm": 1.7761163191300797, "language_loss": 0.71566433, "learning_rate": 2.106668480358098e-06, "loss": 0.73738915, "num_input_tokens_seen": 89303565, "step": 4146, "time_per_iteration": 2.63297963142395 }, { "auxiliary_loss_clip": 0.01144735, "auxiliary_loss_mlp": 0.01024828, "balance_loss_clip": 1.04910302, "balance_loss_mlp": 1.01653135, "epoch": 0.49864726748031024, "flos": 22852868503680.0, "grad_norm": 1.7322204005742854, "language_loss": 0.70784891, "learning_rate": 2.105890609337154e-06, "loss": 0.72954452, "num_input_tokens_seen": 89322080, "step": 4147, "time_per_iteration": 3.311931848526001 }, { "auxiliary_loss_clip": 0.01070423, "auxiliary_loss_mlp": 0.0100047, "balance_loss_clip": 1.01766753, "balance_loss_mlp": 0.99924815, "epoch": 0.4987675103709493, "flos": 70405708544640.0, "grad_norm": 0.6892486820123155, "language_loss": 0.63864636, "learning_rate": 2.1051127222526883e-06, "loss": 0.65935528, "num_input_tokens_seen": 89394195, "step": 4148, "time_per_iteration": 3.1817545890808105 }, { "auxiliary_loss_clip": 0.01165057, "auxiliary_loss_mlp": 0.01024578, "balance_loss_clip": 1.05593157, "balance_loss_mlp": 1.01727295, "epoch": 0.4988877532615884, "flos": 28767571482240.0, "grad_norm": 1.6004983534681403, "language_loss": 0.80643386, "learning_rate": 2.1043348192227067e-06, "loss": 0.82833016, "num_input_tokens_seen": 89414565, "step": 4149, "time_per_iteration": 3.3491454124450684 }, { "auxiliary_loss_clip": 0.01127805, "auxiliary_loss_mlp": 0.01033131, "balance_loss_clip": 1.04977751, "balance_loss_mlp": 1.02617192, "epoch": 0.4990079961522275, "flos": 16872700988160.0, "grad_norm": 1.8169041872467049, "language_loss": 0.61740887, "learning_rate": 2.1035569003652156e-06, "loss": 0.6390183, "num_input_tokens_seen": 89433195, "step": 4150, "time_per_iteration": 2.575763702392578 }, { "auxiliary_loss_clip": 0.0112155, "auxiliary_loss_mlp": 0.01033038, "balance_loss_clip": 1.04833043, "balance_loss_mlp": 1.02451396, "epoch": 0.4991282390428666, "flos": 13291042187520.0, "grad_norm": 1.9879549648538788, "language_loss": 0.81356823, "learning_rate": 2.1027789657982255e-06, "loss": 0.83511412, "num_input_tokens_seen": 89447410, "step": 4151, "time_per_iteration": 2.5380499362945557 }, { "auxiliary_loss_clip": 0.01124009, "auxiliary_loss_mlp": 0.01025955, "balance_loss_clip": 1.05113745, "balance_loss_mlp": 1.01809919, "epoch": 0.4992484819335057, "flos": 21537496454400.0, "grad_norm": 1.866633289057137, "language_loss": 0.77052605, "learning_rate": 2.1020010156397482e-06, "loss": 0.79202569, "num_input_tokens_seen": 89464630, "step": 4152, "time_per_iteration": 2.60009765625 }, { "auxiliary_loss_clip": 0.01164788, "auxiliary_loss_mlp": 0.01030254, "balance_loss_clip": 1.05229187, "balance_loss_mlp": 1.02248788, "epoch": 0.4993687248241448, "flos": 24860095390080.0, "grad_norm": 1.536899164926935, "language_loss": 0.77268744, "learning_rate": 2.101223050007797e-06, "loss": 0.79463792, "num_input_tokens_seen": 89483180, "step": 4153, "time_per_iteration": 2.5271964073181152 }, { "auxiliary_loss_clip": 0.01069785, "auxiliary_loss_mlp": 0.01000766, "balance_loss_clip": 1.01721942, "balance_loss_mlp": 0.99955654, "epoch": 0.49948896771478385, "flos": 62941602453120.0, "grad_norm": 0.8192399197389844, "language_loss": 0.5385918, "learning_rate": 2.1004450690203904e-06, "loss": 0.55929732, "num_input_tokens_seen": 89539260, "step": 4154, "time_per_iteration": 3.1213138103485107 }, { "auxiliary_loss_clip": 0.01069197, "auxiliary_loss_mlp": 0.01000537, "balance_loss_clip": 1.01655769, "balance_loss_mlp": 0.99930328, "epoch": 0.49960921060542296, "flos": 68284213516800.0, "grad_norm": 0.8520163787265298, "language_loss": 0.63357306, "learning_rate": 2.099667072795546e-06, "loss": 0.65427035, "num_input_tokens_seen": 89601380, "step": 4155, "time_per_iteration": 3.1359729766845703 }, { "auxiliary_loss_clip": 0.01162262, "auxiliary_loss_mlp": 0.01025622, "balance_loss_clip": 1.051368, "balance_loss_mlp": 1.01848745, "epoch": 0.49972945349606207, "flos": 23659350618240.0, "grad_norm": 1.8489233767116153, "language_loss": 0.79844034, "learning_rate": 2.0988890614512864e-06, "loss": 0.82031918, "num_input_tokens_seen": 89621270, "step": 4156, "time_per_iteration": 2.5088822841644287 }, { "auxiliary_loss_clip": 0.01155043, "auxiliary_loss_mlp": 0.01028185, "balance_loss_clip": 1.05414093, "balance_loss_mlp": 1.02052021, "epoch": 0.4998496963867011, "flos": 19755825022080.0, "grad_norm": 1.751985588006529, "language_loss": 0.84343684, "learning_rate": 2.098111035105635e-06, "loss": 0.86526912, "num_input_tokens_seen": 89639695, "step": 4157, "time_per_iteration": 2.5491511821746826 }, { "auxiliary_loss_clip": 0.01121976, "auxiliary_loss_mlp": 0.01033299, "balance_loss_clip": 1.0530138, "balance_loss_mlp": 1.02546096, "epoch": 0.49996993927734024, "flos": 22265728790400.0, "grad_norm": 1.6143187047978216, "language_loss": 0.72947335, "learning_rate": 2.0973329938766176e-06, "loss": 0.75102609, "num_input_tokens_seen": 89657125, "step": 4158, "time_per_iteration": 2.569042205810547 }, { "auxiliary_loss_clip": 0.01170771, "auxiliary_loss_mlp": 0.01033381, "balance_loss_clip": 1.05442297, "balance_loss_mlp": 1.02498305, "epoch": 0.5000901821679793, "flos": 23327212533120.0, "grad_norm": 2.55334245985013, "language_loss": 0.78650689, "learning_rate": 2.0965549378822618e-06, "loss": 0.80854839, "num_input_tokens_seen": 89678415, "step": 4159, "time_per_iteration": 2.5390241146087646 }, { "auxiliary_loss_clip": 0.01075835, "auxiliary_loss_mlp": 0.01032195, "balance_loss_clip": 1.04328394, "balance_loss_mlp": 1.02424121, "epoch": 0.5002104250586185, "flos": 20339014239360.0, "grad_norm": 1.8919775288650955, "language_loss": 0.83957708, "learning_rate": 2.095776867240599e-06, "loss": 0.86065739, "num_input_tokens_seen": 89695405, "step": 4160, "time_per_iteration": 2.668036937713623 }, { "auxiliary_loss_clip": 0.01132489, "auxiliary_loss_mlp": 0.01033993, "balance_loss_clip": 1.04832447, "balance_loss_mlp": 1.02636647, "epoch": 0.5003306679492575, "flos": 13991372634240.0, "grad_norm": 2.0132264537897973, "language_loss": 0.82610387, "learning_rate": 2.094998782069661e-06, "loss": 0.84776866, "num_input_tokens_seen": 89713110, "step": 4161, "time_per_iteration": 2.554746627807617 }, { "auxiliary_loss_clip": 0.01176486, "auxiliary_loss_mlp": 0.0102803, "balance_loss_clip": 1.05281913, "balance_loss_mlp": 1.01985216, "epoch": 0.5004509108398966, "flos": 27672762896640.0, "grad_norm": 1.743975916764495, "language_loss": 0.75405288, "learning_rate": 2.0942206824874845e-06, "loss": 0.77609807, "num_input_tokens_seen": 89735885, "step": 4162, "time_per_iteration": 2.5166373252868652 }, { "auxiliary_loss_clip": 0.0116186, "auxiliary_loss_mlp": 0.01025988, "balance_loss_clip": 1.05199146, "balance_loss_mlp": 1.01800096, "epoch": 0.5005711537305357, "flos": 14976186796800.0, "grad_norm": 2.0158587188373773, "language_loss": 0.78963685, "learning_rate": 2.093442568612105e-06, "loss": 0.81151527, "num_input_tokens_seen": 89753690, "step": 4163, "time_per_iteration": 2.4764583110809326 }, { "auxiliary_loss_clip": 0.01178324, "auxiliary_loss_mlp": 0.01024323, "balance_loss_clip": 1.05359054, "balance_loss_mlp": 1.01664543, "epoch": 0.5006913966211748, "flos": 26503259978880.0, "grad_norm": 1.5289662338513503, "language_loss": 0.85118699, "learning_rate": 2.0926644405615613e-06, "loss": 0.87321347, "num_input_tokens_seen": 89774590, "step": 4164, "time_per_iteration": 2.509922504425049 }, { "auxiliary_loss_clip": 0.01133091, "auxiliary_loss_mlp": 0.01032747, "balance_loss_clip": 1.05059099, "balance_loss_mlp": 1.02536833, "epoch": 0.5008116395118138, "flos": 20449295971200.0, "grad_norm": 4.205800256962008, "language_loss": 0.81262183, "learning_rate": 2.091886298453897e-06, "loss": 0.83428025, "num_input_tokens_seen": 89792775, "step": 4165, "time_per_iteration": 2.5400145053863525 }, { "auxiliary_loss_clip": 0.01159769, "auxiliary_loss_mlp": 0.01038075, "balance_loss_clip": 1.04971814, "balance_loss_mlp": 1.03066969, "epoch": 0.500931882402453, "flos": 21579871524480.0, "grad_norm": 1.8706031384962958, "language_loss": 0.73170936, "learning_rate": 2.091108142407153e-06, "loss": 0.75368786, "num_input_tokens_seen": 89811515, "step": 4166, "time_per_iteration": 3.262004852294922 }, { "auxiliary_loss_clip": 0.01045975, "auxiliary_loss_mlp": 0.01006986, "balance_loss_clip": 1.01785231, "balance_loss_mlp": 1.00579441, "epoch": 0.5010521252930921, "flos": 57785011925760.0, "grad_norm": 0.8324467400861327, "language_loss": 0.62391603, "learning_rate": 2.090329972539377e-06, "loss": 0.64444566, "num_input_tokens_seen": 89870080, "step": 4167, "time_per_iteration": 3.163869619369507 }, { "auxiliary_loss_clip": 0.0107453, "auxiliary_loss_mlp": 0.01020824, "balance_loss_clip": 1.04371798, "balance_loss_mlp": 1.01347506, "epoch": 0.5011723681837311, "flos": 18625500864000.0, "grad_norm": 16.11228876097598, "language_loss": 0.68567574, "learning_rate": 2.089551788968616e-06, "loss": 0.70662934, "num_input_tokens_seen": 89888045, "step": 4168, "time_per_iteration": 2.7228903770446777 }, { "auxiliary_loss_clip": 0.01067506, "auxiliary_loss_mlp": 0.01003896, "balance_loss_clip": 1.01480865, "balance_loss_mlp": 1.0026921, "epoch": 0.5012926110743702, "flos": 55883146608000.0, "grad_norm": 0.8329862534729657, "language_loss": 0.60787725, "learning_rate": 2.08877359181292e-06, "loss": 0.6285913, "num_input_tokens_seen": 89944610, "step": 4169, "time_per_iteration": 3.203059196472168 }, { "auxiliary_loss_clip": 0.01137493, "auxiliary_loss_mlp": 0.01026594, "balance_loss_clip": 1.04540968, "balance_loss_mlp": 1.01876783, "epoch": 0.5014128539650093, "flos": 24238266117120.0, "grad_norm": 2.0616467086050387, "language_loss": 0.85589921, "learning_rate": 2.0879953811903396e-06, "loss": 0.87754011, "num_input_tokens_seen": 89959495, "step": 4170, "time_per_iteration": 2.5822701454162598 }, { "auxiliary_loss_clip": 0.01164882, "auxiliary_loss_mlp": 0.01027253, "balance_loss_clip": 1.05312705, "balance_loss_mlp": 1.01868761, "epoch": 0.5015330968556484, "flos": 27527468382720.0, "grad_norm": 1.7806684351801745, "language_loss": 0.78732824, "learning_rate": 2.08721715721893e-06, "loss": 0.80924964, "num_input_tokens_seen": 89978820, "step": 4171, "time_per_iteration": 3.353858232498169 }, { "auxiliary_loss_clip": 0.01162609, "auxiliary_loss_mlp": 0.01028919, "balance_loss_clip": 1.05244851, "balance_loss_mlp": 1.02115297, "epoch": 0.5016533397462875, "flos": 23800802376960.0, "grad_norm": 1.9706806156744732, "language_loss": 0.76989758, "learning_rate": 2.0864389200167477e-06, "loss": 0.7918129, "num_input_tokens_seen": 89997075, "step": 4172, "time_per_iteration": 2.5338642597198486 }, { "auxiliary_loss_clip": 0.01167983, "auxiliary_loss_mlp": 0.00760892, "balance_loss_clip": 1.05456293, "balance_loss_mlp": 1.00017202, "epoch": 0.5017735826369266, "flos": 25295009264640.0, "grad_norm": 1.8617837910706252, "language_loss": 0.79108065, "learning_rate": 2.0856606697018504e-06, "loss": 0.81036937, "num_input_tokens_seen": 90015085, "step": 4173, "time_per_iteration": 2.5470848083496094 }, { "auxiliary_loss_clip": 0.01145988, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.04871273, "balance_loss_mlp": 1.02656507, "epoch": 0.5018938255275657, "flos": 16873203778560.0, "grad_norm": 2.411441451079812, "language_loss": 0.73592293, "learning_rate": 2.084882406392297e-06, "loss": 0.75773036, "num_input_tokens_seen": 90033045, "step": 4174, "time_per_iteration": 4.00055456161499 }, { "auxiliary_loss_clip": 0.01163824, "auxiliary_loss_mlp": 0.01028393, "balance_loss_clip": 1.05199301, "balance_loss_mlp": 1.02073693, "epoch": 0.5020140684182047, "flos": 25515429073920.0, "grad_norm": 2.9864552961503423, "language_loss": 0.70971072, "learning_rate": 2.0841041302061496e-06, "loss": 0.73163283, "num_input_tokens_seen": 90052505, "step": 4175, "time_per_iteration": 3.3809430599212646 }, { "auxiliary_loss_clip": 0.0113667, "auxiliary_loss_mlp": 0.01026583, "balance_loss_clip": 1.04608202, "balance_loss_mlp": 1.01864111, "epoch": 0.5021343113088439, "flos": 23659278791040.0, "grad_norm": 1.73420903101188, "language_loss": 0.75613773, "learning_rate": 2.083325841261473e-06, "loss": 0.77777028, "num_input_tokens_seen": 90071565, "step": 4176, "time_per_iteration": 2.5321435928344727 }, { "auxiliary_loss_clip": 0.01142154, "auxiliary_loss_mlp": 0.01032576, "balance_loss_clip": 1.04664278, "balance_loss_mlp": 1.02505958, "epoch": 0.502254554199483, "flos": 24534673148160.0, "grad_norm": 2.0346113726414394, "language_loss": 0.66083282, "learning_rate": 2.0825475396763322e-06, "loss": 0.68258011, "num_input_tokens_seen": 90092215, "step": 4177, "time_per_iteration": 2.5607407093048096 }, { "auxiliary_loss_clip": 0.01070165, "auxiliary_loss_mlp": 0.01033874, "balance_loss_clip": 1.04065919, "balance_loss_mlp": 1.02608347, "epoch": 0.502374797090122, "flos": 34240285607040.0, "grad_norm": 1.3726682677620918, "language_loss": 0.65577304, "learning_rate": 2.081769225568796e-06, "loss": 0.67681342, "num_input_tokens_seen": 90114665, "step": 4178, "time_per_iteration": 2.8261711597442627 }, { "auxiliary_loss_clip": 0.01166424, "auxiliary_loss_mlp": 0.01028864, "balance_loss_clip": 1.05168939, "balance_loss_mlp": 1.0213958, "epoch": 0.5024950399807612, "flos": 26031106679040.0, "grad_norm": 1.411452304758719, "language_loss": 0.76094055, "learning_rate": 2.0809908990569327e-06, "loss": 0.78289342, "num_input_tokens_seen": 90136445, "step": 4179, "time_per_iteration": 2.540700674057007 }, { "auxiliary_loss_clip": 0.01147171, "auxiliary_loss_mlp": 0.01027369, "balance_loss_clip": 1.04948068, "balance_loss_mlp": 1.01996565, "epoch": 0.5026152828714002, "flos": 21252438120960.0, "grad_norm": 1.6776774641586172, "language_loss": 0.79082036, "learning_rate": 2.0802125602588146e-06, "loss": 0.81256568, "num_input_tokens_seen": 90155710, "step": 4180, "time_per_iteration": 2.5347554683685303 }, { "auxiliary_loss_clip": 0.01177463, "auxiliary_loss_mlp": 0.01031387, "balance_loss_clip": 1.05333328, "balance_loss_mlp": 1.02363241, "epoch": 0.5027355257620393, "flos": 30956111245440.0, "grad_norm": 1.8901339210564587, "language_loss": 0.66836363, "learning_rate": 2.0794342092925146e-06, "loss": 0.6904521, "num_input_tokens_seen": 90176845, "step": 4181, "time_per_iteration": 2.5426881313323975 }, { "auxiliary_loss_clip": 0.01167483, "auxiliary_loss_mlp": 0.0103927, "balance_loss_clip": 1.05474925, "balance_loss_mlp": 1.03202558, "epoch": 0.5028557686526784, "flos": 24791147233920.0, "grad_norm": 1.932340151779029, "language_loss": 0.68094885, "learning_rate": 2.078655846276108e-06, "loss": 0.7030164, "num_input_tokens_seen": 90197175, "step": 4182, "time_per_iteration": 2.5261037349700928 }, { "auxiliary_loss_clip": 0.0114584, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 1.04974163, "balance_loss_mlp": 1.02323878, "epoch": 0.5029760115433175, "flos": 22966992990720.0, "grad_norm": 2.0505844719069826, "language_loss": 0.68900019, "learning_rate": 2.0778774713276727e-06, "loss": 0.71077096, "num_input_tokens_seen": 90216650, "step": 4183, "time_per_iteration": 2.5530202388763428 }, { "auxiliary_loss_clip": 0.01159387, "auxiliary_loss_mlp": 0.01032715, "balance_loss_clip": 1.04730916, "balance_loss_mlp": 1.02454281, "epoch": 0.5030962544339566, "flos": 15305164485120.0, "grad_norm": 1.9509235515434742, "language_loss": 0.67494833, "learning_rate": 2.077099084565287e-06, "loss": 0.69686937, "num_input_tokens_seen": 90234055, "step": 4184, "time_per_iteration": 2.4468019008636475 }, { "auxiliary_loss_clip": 0.01141259, "auxiliary_loss_mlp": 0.0102971, "balance_loss_clip": 1.04552436, "balance_loss_mlp": 1.02204478, "epoch": 0.5032164973245957, "flos": 24494847943680.0, "grad_norm": 2.1553172988686926, "language_loss": 0.65419692, "learning_rate": 2.0763206861070313e-06, "loss": 0.6759066, "num_input_tokens_seen": 90253115, "step": 4185, "time_per_iteration": 2.5473978519439697 }, { "auxiliary_loss_clip": 0.01178872, "auxiliary_loss_mlp": 0.0102991, "balance_loss_clip": 1.05412555, "balance_loss_mlp": 1.02271605, "epoch": 0.5033367402152348, "flos": 16213452721920.0, "grad_norm": 1.9242689369314694, "language_loss": 0.75485933, "learning_rate": 2.0755422760709876e-06, "loss": 0.77694714, "num_input_tokens_seen": 90270515, "step": 4186, "time_per_iteration": 2.420877456665039 }, { "auxiliary_loss_clip": 0.01113528, "auxiliary_loss_mlp": 0.01031559, "balance_loss_clip": 1.04585934, "balance_loss_mlp": 1.023417, "epoch": 0.5034569831058738, "flos": 21391375927680.0, "grad_norm": 1.9830960000054427, "language_loss": 0.76889509, "learning_rate": 2.0747638545752417e-06, "loss": 0.79034597, "num_input_tokens_seen": 90289075, "step": 4187, "time_per_iteration": 2.578738212585449 }, { "auxiliary_loss_clip": 0.01146718, "auxiliary_loss_mlp": 0.01029116, "balance_loss_clip": 1.0529995, "balance_loss_mlp": 1.02124166, "epoch": 0.503577225996513, "flos": 20558751690240.0, "grad_norm": 1.9440099360491072, "language_loss": 0.83215129, "learning_rate": 2.073985421737878e-06, "loss": 0.85390961, "num_input_tokens_seen": 90306385, "step": 4188, "time_per_iteration": 2.5155348777770996 }, { "auxiliary_loss_clip": 0.01167038, "auxiliary_loss_mlp": 0.01024971, "balance_loss_clip": 1.05470324, "balance_loss_mlp": 1.01726389, "epoch": 0.5036974688871521, "flos": 27229157930880.0, "grad_norm": 2.4746969504253884, "language_loss": 0.74212736, "learning_rate": 2.0732069776769844e-06, "loss": 0.76404744, "num_input_tokens_seen": 90323795, "step": 4189, "time_per_iteration": 2.5896644592285156 }, { "auxiliary_loss_clip": 0.0117994, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.055933, "balance_loss_mlp": 1.02317238, "epoch": 0.5038177117777911, "flos": 20412164286720.0, "grad_norm": 2.985659233832435, "language_loss": 0.73377407, "learning_rate": 2.072428522510651e-06, "loss": 0.7558831, "num_input_tokens_seen": 90340360, "step": 4190, "time_per_iteration": 2.4469399452209473 }, { "auxiliary_loss_clip": 0.01127845, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.04685116, "balance_loss_mlp": 1.02476239, "epoch": 0.5039379546684303, "flos": 21907987286400.0, "grad_norm": 2.1850802944588024, "language_loss": 0.76329792, "learning_rate": 2.071650056356968e-06, "loss": 0.78490233, "num_input_tokens_seen": 90357900, "step": 4191, "time_per_iteration": 2.55739688873291 }, { "auxiliary_loss_clip": 0.01178018, "auxiliary_loss_mlp": 0.01032061, "balance_loss_clip": 1.05429316, "balance_loss_mlp": 1.02379405, "epoch": 0.5040581975590693, "flos": 20010718909440.0, "grad_norm": 1.9055121657039031, "language_loss": 0.79658091, "learning_rate": 2.070871579334028e-06, "loss": 0.81868172, "num_input_tokens_seen": 90377010, "step": 4192, "time_per_iteration": 3.2054247856140137 }, { "auxiliary_loss_clip": 0.01177887, "auxiliary_loss_mlp": 0.01026652, "balance_loss_clip": 1.05453849, "balance_loss_mlp": 1.01885605, "epoch": 0.5041784404497084, "flos": 20959837931520.0, "grad_norm": 1.8798911773675568, "language_loss": 0.7166698, "learning_rate": 2.0700930915599264e-06, "loss": 0.73871517, "num_input_tokens_seen": 90396740, "step": 4193, "time_per_iteration": 2.490858793258667 }, { "auxiliary_loss_clip": 0.01177327, "auxiliary_loss_mlp": 0.01030954, "balance_loss_clip": 1.05407524, "balance_loss_mlp": 1.02321136, "epoch": 0.5042986833403476, "flos": 12495082757760.0, "grad_norm": 1.980026942077062, "language_loss": 0.78054607, "learning_rate": 2.0693145931527583e-06, "loss": 0.80262893, "num_input_tokens_seen": 90413220, "step": 4194, "time_per_iteration": 2.4330241680145264 }, { "auxiliary_loss_clip": 0.01143162, "auxiliary_loss_mlp": 0.01026054, "balance_loss_clip": 1.04919732, "balance_loss_mlp": 1.01858592, "epoch": 0.5044189262309866, "flos": 29202305788800.0, "grad_norm": 1.6372760965022268, "language_loss": 0.78126812, "learning_rate": 2.068536084230622e-06, "loss": 0.80296028, "num_input_tokens_seen": 90435085, "step": 4195, "time_per_iteration": 2.5685012340545654 }, { "auxiliary_loss_clip": 0.0116416, "auxiliary_loss_mlp": 0.01030857, "balance_loss_clip": 1.05200505, "balance_loss_mlp": 1.02332282, "epoch": 0.5045391691216257, "flos": 23873198238720.0, "grad_norm": 1.91953398232203, "language_loss": 0.88615626, "learning_rate": 2.067757564911616e-06, "loss": 0.90810645, "num_input_tokens_seen": 90453660, "step": 4196, "time_per_iteration": 2.51774001121521 }, { "auxiliary_loss_clip": 0.01155845, "auxiliary_loss_mlp": 0.00761585, "balance_loss_clip": 1.05011117, "balance_loss_mlp": 1.00019908, "epoch": 0.5046594120122648, "flos": 24644990793600.0, "grad_norm": 1.8569619877672905, "language_loss": 0.92448008, "learning_rate": 2.0669790353138407e-06, "loss": 0.9436543, "num_input_tokens_seen": 90472625, "step": 4197, "time_per_iteration": 3.5277116298675537 }, { "auxiliary_loss_clip": 0.01132486, "auxiliary_loss_mlp": 0.00761489, "balance_loss_clip": 1.05234206, "balance_loss_mlp": 1.00017345, "epoch": 0.5047796549029039, "flos": 23362835846400.0, "grad_norm": 2.2883263087813543, "language_loss": 0.72756457, "learning_rate": 2.0662004955553995e-06, "loss": 0.74650431, "num_input_tokens_seen": 90492325, "step": 4198, "time_per_iteration": 2.577946186065674 }, { "auxiliary_loss_clip": 0.01144672, "auxiliary_loss_mlp": 0.01027847, "balance_loss_clip": 1.04984498, "balance_loss_mlp": 1.02067614, "epoch": 0.5048998977935429, "flos": 17304095329920.0, "grad_norm": 2.242887396870967, "language_loss": 0.76986921, "learning_rate": 2.065421945754395e-06, "loss": 0.79159439, "num_input_tokens_seen": 90510055, "step": 4199, "time_per_iteration": 3.5041067600250244 }, { "auxiliary_loss_clip": 0.01124371, "auxiliary_loss_mlp": 0.01034844, "balance_loss_clip": 1.05040789, "balance_loss_mlp": 1.02784395, "epoch": 0.505020140684182, "flos": 34856979235200.0, "grad_norm": 1.581844721386092, "language_loss": 0.78115481, "learning_rate": 2.0646433860289344e-06, "loss": 0.80274695, "num_input_tokens_seen": 90528980, "step": 4200, "time_per_iteration": 2.7079176902770996 }, { "auxiliary_loss_clip": 0.01167235, "auxiliary_loss_mlp": 0.00761784, "balance_loss_clip": 1.05257678, "balance_loss_mlp": 1.00021267, "epoch": 0.5051403835748212, "flos": 24863974058880.0, "grad_norm": 1.898663818054111, "language_loss": 0.82666916, "learning_rate": 2.0638648164971233e-06, "loss": 0.84595931, "num_input_tokens_seen": 90547445, "step": 4201, "time_per_iteration": 3.3593008518218994 }, { "auxiliary_loss_clip": 0.01146937, "auxiliary_loss_mlp": 0.0103006, "balance_loss_clip": 1.05138469, "balance_loss_mlp": 1.02278876, "epoch": 0.5052606264654602, "flos": 20959694277120.0, "grad_norm": 1.7610956015174006, "language_loss": 0.88204002, "learning_rate": 2.06308623727707e-06, "loss": 0.90381002, "num_input_tokens_seen": 90567545, "step": 4202, "time_per_iteration": 2.5143041610717773 }, { "auxiliary_loss_clip": 0.01158006, "auxiliary_loss_mlp": 0.01030068, "balance_loss_clip": 1.05105853, "balance_loss_mlp": 1.0223012, "epoch": 0.5053808693560993, "flos": 19642382893440.0, "grad_norm": 3.414404056917229, "language_loss": 0.76468009, "learning_rate": 2.0623076484868846e-06, "loss": 0.78656077, "num_input_tokens_seen": 90585000, "step": 4203, "time_per_iteration": 2.4805896282196045 }, { "auxiliary_loss_clip": 0.0104732, "auxiliary_loss_mlp": 0.01001328, "balance_loss_clip": 1.01873136, "balance_loss_mlp": 1.00019574, "epoch": 0.5055011122467384, "flos": 67504915019520.0, "grad_norm": 0.8355337152611019, "language_loss": 0.60724926, "learning_rate": 2.061529050244679e-06, "loss": 0.62773573, "num_input_tokens_seen": 90644745, "step": 4204, "time_per_iteration": 3.085674285888672 }, { "auxiliary_loss_clip": 0.01139251, "auxiliary_loss_mlp": 0.01028543, "balance_loss_clip": 1.04781306, "balance_loss_mlp": 1.02096403, "epoch": 0.5056213551373775, "flos": 16872952383360.0, "grad_norm": 1.8714808254221986, "language_loss": 0.74201095, "learning_rate": 2.060750442668565e-06, "loss": 0.76368892, "num_input_tokens_seen": 90662500, "step": 4205, "time_per_iteration": 2.56261944770813 }, { "auxiliary_loss_clip": 0.01166494, "auxiliary_loss_mlp": 0.01031116, "balance_loss_clip": 1.05485272, "balance_loss_mlp": 1.02362943, "epoch": 0.5057415980280165, "flos": 15334179696000.0, "grad_norm": 2.0871415031534095, "language_loss": 0.63285404, "learning_rate": 2.059971825876657e-06, "loss": 0.65483022, "num_input_tokens_seen": 90677010, "step": 4206, "time_per_iteration": 2.4507975578308105 }, { "auxiliary_loss_clip": 0.01165913, "auxiliary_loss_mlp": 0.01024603, "balance_loss_clip": 1.05157995, "balance_loss_mlp": 1.01730704, "epoch": 0.5058618409186557, "flos": 19025976574080.0, "grad_norm": 1.8202379029697286, "language_loss": 0.76183939, "learning_rate": 2.0591931999870713e-06, "loss": 0.78374457, "num_input_tokens_seen": 90695935, "step": 4207, "time_per_iteration": 2.4887349605560303 }, { "auxiliary_loss_clip": 0.01054631, "auxiliary_loss_mlp": 0.01001131, "balance_loss_clip": 1.01687884, "balance_loss_mlp": 0.99997443, "epoch": 0.5059820838092948, "flos": 63453114080640.0, "grad_norm": 0.8154018563046301, "language_loss": 0.57601523, "learning_rate": 2.0584145651179234e-06, "loss": 0.59657288, "num_input_tokens_seen": 90751645, "step": 4208, "time_per_iteration": 3.111002206802368 }, { "auxiliary_loss_clip": 0.01150409, "auxiliary_loss_mlp": 0.00761132, "balance_loss_clip": 1.05227113, "balance_loss_mlp": 1.000175, "epoch": 0.5061023266999338, "flos": 15441803821440.0, "grad_norm": 2.293162442815295, "language_loss": 0.79760212, "learning_rate": 2.0576359213873327e-06, "loss": 0.81671751, "num_input_tokens_seen": 90766795, "step": 4209, "time_per_iteration": 2.491255283355713 }, { "auxiliary_loss_clip": 0.01155021, "auxiliary_loss_mlp": 0.01030972, "balance_loss_clip": 1.0485692, "balance_loss_mlp": 1.02280045, "epoch": 0.506222569590573, "flos": 22451063990400.0, "grad_norm": 3.1146726348033598, "language_loss": 0.69539279, "learning_rate": 2.056857268913419e-06, "loss": 0.71725273, "num_input_tokens_seen": 90786845, "step": 4210, "time_per_iteration": 2.5574803352355957 }, { "auxiliary_loss_clip": 0.01162657, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.05253291, "balance_loss_mlp": 1.02526522, "epoch": 0.506342812481212, "flos": 17558665994880.0, "grad_norm": 2.4971189791046338, "language_loss": 0.84000343, "learning_rate": 2.056078607814303e-06, "loss": 0.86195827, "num_input_tokens_seen": 90802630, "step": 4211, "time_per_iteration": 2.4595162868499756 }, { "auxiliary_loss_clip": 0.01161974, "auxiliary_loss_mlp": 0.01024487, "balance_loss_clip": 1.05292678, "balance_loss_mlp": 1.01710176, "epoch": 0.5064630553718511, "flos": 23402050519680.0, "grad_norm": 1.6345426627252317, "language_loss": 0.78312564, "learning_rate": 2.055299938208106e-06, "loss": 0.80499023, "num_input_tokens_seen": 90823620, "step": 4212, "time_per_iteration": 2.5733213424682617 }, { "auxiliary_loss_clip": 0.01168702, "auxiliary_loss_mlp": 0.01036878, "balance_loss_clip": 1.05311191, "balance_loss_mlp": 1.0290103, "epoch": 0.5065832982624903, "flos": 23987035416960.0, "grad_norm": 1.6112922213156586, "language_loss": 0.86340767, "learning_rate": 2.0545212602129526e-06, "loss": 0.88546348, "num_input_tokens_seen": 90843475, "step": 4213, "time_per_iteration": 2.5165650844573975 }, { "auxiliary_loss_clip": 0.01141536, "auxiliary_loss_mlp": 0.01032005, "balance_loss_clip": 1.04848361, "balance_loss_mlp": 1.02408361, "epoch": 0.5067035411531293, "flos": 21503058289920.0, "grad_norm": 1.9953579905253076, "language_loss": 0.65967679, "learning_rate": 2.0537425739469673e-06, "loss": 0.68141222, "num_input_tokens_seen": 90862410, "step": 4214, "time_per_iteration": 2.5255584716796875 }, { "auxiliary_loss_clip": 0.01062264, "auxiliary_loss_mlp": 0.01003015, "balance_loss_clip": 1.01865435, "balance_loss_mlp": 1.0017271, "epoch": 0.5068237840437684, "flos": 65934397687680.0, "grad_norm": 0.8400665669410669, "language_loss": 0.59557259, "learning_rate": 2.052963879528276e-06, "loss": 0.61622536, "num_input_tokens_seen": 90922280, "step": 4215, "time_per_iteration": 3.067805767059326 }, { "auxiliary_loss_clip": 0.01164331, "auxiliary_loss_mlp": 0.01028943, "balance_loss_clip": 1.05378652, "balance_loss_mlp": 1.0215131, "epoch": 0.5069440269344075, "flos": 27264206626560.0, "grad_norm": 2.0978684890715185, "language_loss": 0.76611459, "learning_rate": 2.052185177075007e-06, "loss": 0.78804737, "num_input_tokens_seen": 90941850, "step": 4216, "time_per_iteration": 2.56473708152771 }, { "auxiliary_loss_clip": 0.01165612, "auxiliary_loss_mlp": 0.01029182, "balance_loss_clip": 1.05159724, "balance_loss_mlp": 1.02119756, "epoch": 0.5070642698250466, "flos": 23366319465600.0, "grad_norm": 2.0021178232316807, "language_loss": 0.82545358, "learning_rate": 2.051406466705288e-06, "loss": 0.8474015, "num_input_tokens_seen": 90961390, "step": 4217, "time_per_iteration": 2.530457019805908 }, { "auxiliary_loss_clip": 0.01176707, "auxiliary_loss_mlp": 0.01029921, "balance_loss_clip": 1.05155206, "balance_loss_mlp": 1.02279198, "epoch": 0.5071845127156857, "flos": 20340127560960.0, "grad_norm": 2.86171989589709, "language_loss": 0.81645536, "learning_rate": 2.0506277485372486e-06, "loss": 0.8385216, "num_input_tokens_seen": 90980215, "step": 4218, "time_per_iteration": 3.2556400299072266 }, { "auxiliary_loss_clip": 0.0115695, "auxiliary_loss_mlp": 0.01026047, "balance_loss_clip": 1.05057788, "balance_loss_mlp": 1.01767874, "epoch": 0.5073047556063248, "flos": 12092955022080.0, "grad_norm": 1.7778781648639799, "language_loss": 0.66863912, "learning_rate": 2.04984902268902e-06, "loss": 0.69046909, "num_input_tokens_seen": 90997415, "step": 4219, "time_per_iteration": 2.4564056396484375 }, { "auxiliary_loss_clip": 0.01172983, "auxiliary_loss_mlp": 0.01031169, "balance_loss_clip": 1.05459666, "balance_loss_mlp": 1.0229528, "epoch": 0.5074249984969639, "flos": 19682854542720.0, "grad_norm": 2.506483656681027, "language_loss": 0.75785333, "learning_rate": 2.0490702892787345e-06, "loss": 0.77989483, "num_input_tokens_seen": 91016475, "step": 4220, "time_per_iteration": 2.5158536434173584 }, { "auxiliary_loss_clip": 0.01153525, "auxiliary_loss_mlp": 0.01029406, "balance_loss_clip": 1.04746556, "balance_loss_mlp": 1.02189565, "epoch": 0.5075452413876029, "flos": 28765703975040.0, "grad_norm": 1.7437900176018322, "language_loss": 0.62417096, "learning_rate": 2.0482915484245246e-06, "loss": 0.64600027, "num_input_tokens_seen": 91038095, "step": 4221, "time_per_iteration": 2.5436689853668213 }, { "auxiliary_loss_clip": 0.01113281, "auxiliary_loss_mlp": 0.01035679, "balance_loss_clip": 1.04616141, "balance_loss_mlp": 1.02775705, "epoch": 0.5076654842782421, "flos": 20339445202560.0, "grad_norm": 2.3523043350310457, "language_loss": 0.84171426, "learning_rate": 2.047512800244526e-06, "loss": 0.86320382, "num_input_tokens_seen": 91053360, "step": 4222, "time_per_iteration": 2.561302900314331 }, { "auxiliary_loss_clip": 0.01164682, "auxiliary_loss_mlp": 0.01028807, "balance_loss_clip": 1.05386817, "balance_loss_mlp": 1.02110004, "epoch": 0.5077857271688812, "flos": 26359653404160.0, "grad_norm": 1.8421534975061489, "language_loss": 0.78822172, "learning_rate": 2.046734044856873e-06, "loss": 0.81015658, "num_input_tokens_seen": 91072770, "step": 4223, "time_per_iteration": 3.2972469329833984 }, { "auxiliary_loss_clip": 0.01167099, "auxiliary_loss_mlp": 0.01026357, "balance_loss_clip": 1.05560184, "balance_loss_mlp": 1.01896, "epoch": 0.5079059700595202, "flos": 21798962530560.0, "grad_norm": 2.0372088377184627, "language_loss": 0.81090522, "learning_rate": 2.045955282379702e-06, "loss": 0.83283979, "num_input_tokens_seen": 91091430, "step": 4224, "time_per_iteration": 2.5045578479766846 }, { "auxiliary_loss_clip": 0.01161316, "auxiliary_loss_mlp": 0.01034365, "balance_loss_clip": 1.04988265, "balance_loss_mlp": 1.02654791, "epoch": 0.5080262129501594, "flos": 13187943175680.0, "grad_norm": 2.473930552007521, "language_loss": 0.7569347, "learning_rate": 2.045176512931152e-06, "loss": 0.7788915, "num_input_tokens_seen": 91106060, "step": 4225, "time_per_iteration": 3.4761996269226074 }, { "auxiliary_loss_clip": 0.011372, "auxiliary_loss_mlp": 0.01024587, "balance_loss_clip": 1.04983008, "balance_loss_mlp": 1.01657045, "epoch": 0.5081464558407984, "flos": 25301473712640.0, "grad_norm": 1.893992660779313, "language_loss": 0.75820231, "learning_rate": 2.0443977366293604e-06, "loss": 0.77982014, "num_input_tokens_seen": 91124100, "step": 4226, "time_per_iteration": 3.448227643966675 }, { "auxiliary_loss_clip": 0.01107081, "auxiliary_loss_mlp": 0.01025583, "balance_loss_clip": 1.04487848, "balance_loss_mlp": 1.01736355, "epoch": 0.5082666987314375, "flos": 30951226995840.0, "grad_norm": 1.584140773506295, "language_loss": 0.77131808, "learning_rate": 2.043618953592468e-06, "loss": 0.7926448, "num_input_tokens_seen": 91146555, "step": 4227, "time_per_iteration": 2.7680504322052 }, { "auxiliary_loss_clip": 0.01151506, "auxiliary_loss_mlp": 0.01038454, "balance_loss_clip": 1.0524646, "balance_loss_mlp": 1.03036571, "epoch": 0.5083869416220766, "flos": 19682495406720.0, "grad_norm": 2.235483727179507, "language_loss": 0.81278104, "learning_rate": 2.0428401639386144e-06, "loss": 0.83468068, "num_input_tokens_seen": 91167120, "step": 4228, "time_per_iteration": 2.8543386459350586 }, { "auxiliary_loss_clip": 0.01043192, "auxiliary_loss_mlp": 0.00999851, "balance_loss_clip": 1.01409721, "balance_loss_mlp": 0.9987008, "epoch": 0.5085071845127157, "flos": 71817535589760.0, "grad_norm": 0.8260345526442083, "language_loss": 0.58249795, "learning_rate": 2.042061367785943e-06, "loss": 0.6029284, "num_input_tokens_seen": 91220260, "step": 4229, "time_per_iteration": 3.0936594009399414 }, { "auxiliary_loss_clip": 0.01135906, "auxiliary_loss_mlp": 0.01031493, "balance_loss_clip": 1.04609931, "balance_loss_mlp": 1.02375674, "epoch": 0.5086274274033548, "flos": 35951608252800.0, "grad_norm": 2.0048920953621163, "language_loss": 0.75004053, "learning_rate": 2.041282565252594e-06, "loss": 0.77171457, "num_input_tokens_seen": 91240425, "step": 4230, "time_per_iteration": 2.747380256652832 }, { "auxiliary_loss_clip": 0.01130465, "auxiliary_loss_mlp": 0.01027626, "balance_loss_clip": 1.04377162, "balance_loss_mlp": 1.02036047, "epoch": 0.5087476702939938, "flos": 23513732881920.0, "grad_norm": 1.634317689359367, "language_loss": 0.77097267, "learning_rate": 2.040503756456714e-06, "loss": 0.79255354, "num_input_tokens_seen": 91259635, "step": 4231, "time_per_iteration": 2.571500301361084 }, { "auxiliary_loss_clip": 0.01154676, "auxiliary_loss_mlp": 0.01035077, "balance_loss_clip": 1.04760027, "balance_loss_mlp": 1.02784085, "epoch": 0.508867913184633, "flos": 15122091841920.0, "grad_norm": 1.8055752356351231, "language_loss": 0.78736484, "learning_rate": 2.0397249415164456e-06, "loss": 0.80926234, "num_input_tokens_seen": 91276990, "step": 4232, "time_per_iteration": 2.471558094024658 }, { "auxiliary_loss_clip": 0.01137794, "auxiliary_loss_mlp": 0.01034042, "balance_loss_clip": 1.04456997, "balance_loss_mlp": 1.0260371, "epoch": 0.508988156075272, "flos": 25885309374720.0, "grad_norm": 1.894749687970946, "language_loss": 0.80238324, "learning_rate": 2.0389461205499354e-06, "loss": 0.82410163, "num_input_tokens_seen": 91296125, "step": 4233, "time_per_iteration": 2.559316635131836 }, { "auxiliary_loss_clip": 0.01137836, "auxiliary_loss_mlp": 0.01032798, "balance_loss_clip": 1.04759526, "balance_loss_mlp": 1.02542472, "epoch": 0.5091083989659111, "flos": 13844857057920.0, "grad_norm": 2.121975015988607, "language_loss": 0.72800338, "learning_rate": 2.03816729367533e-06, "loss": 0.74970978, "num_input_tokens_seen": 91314280, "step": 4234, "time_per_iteration": 2.535372495651245 }, { "auxiliary_loss_clip": 0.01149938, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.05074239, "balance_loss_mlp": 1.0247556, "epoch": 0.5092286418565503, "flos": 21104881050240.0, "grad_norm": 1.9161355351015, "language_loss": 0.71780252, "learning_rate": 2.0373884610107765e-06, "loss": 0.73963082, "num_input_tokens_seen": 91334595, "step": 4235, "time_per_iteration": 2.5335328578948975 }, { "auxiliary_loss_clip": 0.01166432, "auxiliary_loss_mlp": 0.01025482, "balance_loss_clip": 1.05104828, "balance_loss_mlp": 1.01783466, "epoch": 0.5093488847471893, "flos": 18621298972800.0, "grad_norm": 2.110168761226361, "language_loss": 0.69410467, "learning_rate": 2.0366096226744225e-06, "loss": 0.7160238, "num_input_tokens_seen": 91349790, "step": 4236, "time_per_iteration": 2.453207492828369 }, { "auxiliary_loss_clip": 0.01154961, "auxiliary_loss_mlp": 0.01032731, "balance_loss_clip": 1.05097771, "balance_loss_mlp": 1.02486944, "epoch": 0.5094691276378284, "flos": 23803783205760.0, "grad_norm": 1.6303412867079519, "language_loss": 0.76964748, "learning_rate": 2.035830778784418e-06, "loss": 0.79152441, "num_input_tokens_seen": 91370465, "step": 4237, "time_per_iteration": 2.525562047958374 }, { "auxiliary_loss_clip": 0.01152983, "auxiliary_loss_mlp": 0.01033823, "balance_loss_clip": 1.05602157, "balance_loss_mlp": 1.02584243, "epoch": 0.5095893705284675, "flos": 17420410546560.0, "grad_norm": 1.8525517279484773, "language_loss": 0.79613757, "learning_rate": 2.0350519294589134e-06, "loss": 0.81800562, "num_input_tokens_seen": 91388505, "step": 4238, "time_per_iteration": 2.5369467735290527 }, { "auxiliary_loss_clip": 0.0111399, "auxiliary_loss_mlp": 0.01030052, "balance_loss_clip": 1.04315567, "balance_loss_mlp": 1.02143908, "epoch": 0.5097096134191066, "flos": 25849362839040.0, "grad_norm": 1.7301522862318621, "language_loss": 0.82847822, "learning_rate": 2.0342730748160588e-06, "loss": 0.8499186, "num_input_tokens_seen": 91408970, "step": 4239, "time_per_iteration": 2.6345903873443604 }, { "auxiliary_loss_clip": 0.01147611, "auxiliary_loss_mlp": 0.01027437, "balance_loss_clip": 1.04890871, "balance_loss_mlp": 1.0197655, "epoch": 0.5098298563097456, "flos": 27745122844800.0, "grad_norm": 2.524601991108417, "language_loss": 0.70918834, "learning_rate": 2.033494214974006e-06, "loss": 0.73093879, "num_input_tokens_seen": 91430115, "step": 4240, "time_per_iteration": 2.5926387310028076 }, { "auxiliary_loss_clip": 0.01138618, "auxiliary_loss_mlp": 0.01032592, "balance_loss_clip": 1.04994321, "balance_loss_mlp": 1.02515292, "epoch": 0.5099500992003848, "flos": 21358913011200.0, "grad_norm": 1.64939661716861, "language_loss": 0.84049702, "learning_rate": 2.0327153500509067e-06, "loss": 0.86220908, "num_input_tokens_seen": 91449140, "step": 4241, "time_per_iteration": 2.5476481914520264 }, { "auxiliary_loss_clip": 0.01149327, "auxiliary_loss_mlp": 0.01026815, "balance_loss_clip": 1.05007505, "balance_loss_mlp": 1.01906633, "epoch": 0.5100703420910239, "flos": 19865999013120.0, "grad_norm": 1.9343243634454035, "language_loss": 0.84635299, "learning_rate": 2.031936480164916e-06, "loss": 0.86811441, "num_input_tokens_seen": 91466880, "step": 4242, "time_per_iteration": 2.5282375812530518 }, { "auxiliary_loss_clip": 0.01145956, "auxiliary_loss_mlp": 0.010362, "balance_loss_clip": 1.05205619, "balance_loss_mlp": 1.028916, "epoch": 0.5101905849816629, "flos": 24648797635200.0, "grad_norm": 19.148422893939586, "language_loss": 0.80279952, "learning_rate": 2.0311576054341857e-06, "loss": 0.82462108, "num_input_tokens_seen": 91487495, "step": 4243, "time_per_iteration": 2.5755820274353027 }, { "auxiliary_loss_clip": 0.01177899, "auxiliary_loss_mlp": 0.01023635, "balance_loss_clip": 1.05500984, "balance_loss_mlp": 1.01592207, "epoch": 0.5103108278723021, "flos": 22930076787840.0, "grad_norm": 1.7619289755129703, "language_loss": 0.62463969, "learning_rate": 2.0303787259768715e-06, "loss": 0.64665496, "num_input_tokens_seen": 91508395, "step": 4244, "time_per_iteration": 3.1928088665008545 }, { "auxiliary_loss_clip": 0.01151236, "auxiliary_loss_mlp": 0.0103564, "balance_loss_clip": 1.05264115, "balance_loss_mlp": 1.02756989, "epoch": 0.5104310707629411, "flos": 21506613736320.0, "grad_norm": 5.623791405200984, "language_loss": 0.70001751, "learning_rate": 2.0295998419111294e-06, "loss": 0.72188628, "num_input_tokens_seen": 91525685, "step": 4245, "time_per_iteration": 2.5347659587860107 }, { "auxiliary_loss_clip": 0.01105989, "auxiliary_loss_mlp": 0.01028578, "balance_loss_clip": 1.04291689, "balance_loss_mlp": 1.02051091, "epoch": 0.5105513136535802, "flos": 14903180403840.0, "grad_norm": 3.3553847771161256, "language_loss": 0.73540884, "learning_rate": 2.028820953355115e-06, "loss": 0.75675452, "num_input_tokens_seen": 91543785, "step": 4246, "time_per_iteration": 2.616319417953491 }, { "auxiliary_loss_clip": 0.01154637, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 1.04963875, "balance_loss_mlp": 1.02023208, "epoch": 0.5106715565442194, "flos": 22602212421120.0, "grad_norm": 1.9120691037856727, "language_loss": 0.78607589, "learning_rate": 2.0280420604269834e-06, "loss": 0.80789888, "num_input_tokens_seen": 91563325, "step": 4247, "time_per_iteration": 2.538588047027588 }, { "auxiliary_loss_clip": 0.01058061, "auxiliary_loss_mlp": 0.01001748, "balance_loss_clip": 1.01679182, "balance_loss_mlp": 1.00053239, "epoch": 0.5107917994348584, "flos": 71027645558400.0, "grad_norm": 0.7076487985884422, "language_loss": 0.5895884, "learning_rate": 2.027263163244895e-06, "loss": 0.61018646, "num_input_tokens_seen": 91632450, "step": 4248, "time_per_iteration": 4.110201597213745 }, { "auxiliary_loss_clip": 0.01161138, "auxiliary_loss_mlp": 0.01029502, "balance_loss_clip": 1.05235934, "balance_loss_mlp": 1.02197981, "epoch": 0.5109120423254975, "flos": 24827416992000.0, "grad_norm": 1.5232248889729476, "language_loss": 0.74421763, "learning_rate": 2.026484261927005e-06, "loss": 0.76612401, "num_input_tokens_seen": 91651945, "step": 4249, "time_per_iteration": 2.5455281734466553 }, { "auxiliary_loss_clip": 0.01171682, "auxiliary_loss_mlp": 0.01033018, "balance_loss_clip": 1.05535161, "balance_loss_mlp": 1.02519751, "epoch": 0.5110322852161366, "flos": 21247661612160.0, "grad_norm": 2.3133633147219066, "language_loss": 0.74179596, "learning_rate": 2.025705356591475e-06, "loss": 0.76384294, "num_input_tokens_seen": 91669635, "step": 4250, "time_per_iteration": 3.2100374698638916 }, { "auxiliary_loss_clip": 0.01034022, "auxiliary_loss_mlp": 0.00751192, "balance_loss_clip": 1.0140388, "balance_loss_mlp": 1.00013685, "epoch": 0.5111525281067757, "flos": 66457114358400.0, "grad_norm": 0.7620301251467885, "language_loss": 0.57974541, "learning_rate": 2.024926447356462e-06, "loss": 0.5975976, "num_input_tokens_seen": 91731920, "step": 4251, "time_per_iteration": 3.9212234020233154 }, { "auxiliary_loss_clip": 0.01160513, "auxiliary_loss_mlp": 0.01029625, "balance_loss_clip": 1.05122852, "balance_loss_mlp": 1.02154827, "epoch": 0.5112727709974147, "flos": 14866731077760.0, "grad_norm": 1.832463590767697, "language_loss": 0.78780591, "learning_rate": 2.024147534340127e-06, "loss": 0.80970722, "num_input_tokens_seen": 91749780, "step": 4252, "time_per_iteration": 2.4754793643951416 }, { "auxiliary_loss_clip": 0.01143618, "auxiliary_loss_mlp": 0.01027299, "balance_loss_clip": 1.04650879, "balance_loss_mlp": 1.01939499, "epoch": 0.5113930138880539, "flos": 21177600134400.0, "grad_norm": 1.622173986512249, "language_loss": 0.79724145, "learning_rate": 2.02336861766063e-06, "loss": 0.81895065, "num_input_tokens_seen": 91768840, "step": 4253, "time_per_iteration": 2.5390281677246094 }, { "auxiliary_loss_clip": 0.01170614, "auxiliary_loss_mlp": 0.01027267, "balance_loss_clip": 1.05396771, "balance_loss_mlp": 1.01858222, "epoch": 0.511513256778693, "flos": 20409111630720.0, "grad_norm": 1.8552641160080228, "language_loss": 0.79037726, "learning_rate": 2.0225896974361327e-06, "loss": 0.81235605, "num_input_tokens_seen": 91788945, "step": 4254, "time_per_iteration": 2.5272183418273926 }, { "auxiliary_loss_clip": 0.01037173, "auxiliary_loss_mlp": 0.01003101, "balance_loss_clip": 1.01716614, "balance_loss_mlp": 1.00199211, "epoch": 0.511633499669332, "flos": 69879975131520.0, "grad_norm": 1.0568224739637946, "language_loss": 0.59942567, "learning_rate": 2.0218107737847962e-06, "loss": 0.6198284, "num_input_tokens_seen": 91850990, "step": 4255, "time_per_iteration": 3.1861486434936523 }, { "auxiliary_loss_clip": 0.01178761, "auxiliary_loss_mlp": 0.01026259, "balance_loss_clip": 1.05452871, "balance_loss_mlp": 1.01867151, "epoch": 0.5117537425599712, "flos": 24097855852800.0, "grad_norm": 2.0071463806249676, "language_loss": 0.74596131, "learning_rate": 2.0210318468247826e-06, "loss": 0.76801145, "num_input_tokens_seen": 91869960, "step": 4256, "time_per_iteration": 2.4899258613586426 }, { "auxiliary_loss_clip": 0.01145946, "auxiliary_loss_mlp": 0.01028569, "balance_loss_clip": 1.0475688, "balance_loss_mlp": 1.02091575, "epoch": 0.5118739854506102, "flos": 20959550622720.0, "grad_norm": 1.833633324792196, "language_loss": 0.81499207, "learning_rate": 2.020252916674255e-06, "loss": 0.83673728, "num_input_tokens_seen": 91889075, "step": 4257, "time_per_iteration": 2.523074150085449 }, { "auxiliary_loss_clip": 0.0116426, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.05110884, "balance_loss_mlp": 1.02340436, "epoch": 0.5119942283412493, "flos": 17457326749440.0, "grad_norm": 3.2766195322447604, "language_loss": 0.81395841, "learning_rate": 2.019473983451375e-06, "loss": 0.83591408, "num_input_tokens_seen": 91907495, "step": 4258, "time_per_iteration": 2.4823482036590576 }, { "auxiliary_loss_clip": 0.01139747, "auxiliary_loss_mlp": 0.01034248, "balance_loss_clip": 1.04798651, "balance_loss_mlp": 1.02653563, "epoch": 0.5121144712318885, "flos": 21066743784960.0, "grad_norm": 1.718382687595878, "language_loss": 0.71629989, "learning_rate": 2.0186950472743076e-06, "loss": 0.73803985, "num_input_tokens_seen": 91927400, "step": 4259, "time_per_iteration": 2.5631825923919678 }, { "auxiliary_loss_clip": 0.01177569, "auxiliary_loss_mlp": 0.01029843, "balance_loss_clip": 1.05253041, "balance_loss_mlp": 1.02259779, "epoch": 0.5122347141225275, "flos": 19860791541120.0, "grad_norm": 1.5907694330539792, "language_loss": 0.73875964, "learning_rate": 2.0179161082612162e-06, "loss": 0.7608338, "num_input_tokens_seen": 91946790, "step": 4260, "time_per_iteration": 2.4622490406036377 }, { "auxiliary_loss_clip": 0.01142555, "auxiliary_loss_mlp": 0.01029346, "balance_loss_clip": 1.04540813, "balance_loss_mlp": 1.02103686, "epoch": 0.5123549570131666, "flos": 22528487756160.0, "grad_norm": 1.9066259043377503, "language_loss": 0.73033667, "learning_rate": 2.017137166530266e-06, "loss": 0.75205564, "num_input_tokens_seen": 91966325, "step": 4261, "time_per_iteration": 2.518465280532837 }, { "auxiliary_loss_clip": 0.01153596, "auxiliary_loss_mlp": 0.0102991, "balance_loss_clip": 1.05061865, "balance_loss_mlp": 1.02240241, "epoch": 0.5124751999038056, "flos": 20333375804160.0, "grad_norm": 2.419968695534637, "language_loss": 0.79922098, "learning_rate": 2.0163582221996213e-06, "loss": 0.82105601, "num_input_tokens_seen": 91984700, "step": 4262, "time_per_iteration": 2.526128053665161 }, { "auxiliary_loss_clip": 0.01149544, "auxiliary_loss_mlp": 0.01026539, "balance_loss_clip": 1.05141807, "balance_loss_mlp": 1.01846302, "epoch": 0.5125954427944448, "flos": 39785970211200.0, "grad_norm": 1.9274577912212143, "language_loss": 0.68044186, "learning_rate": 2.015579275387446e-06, "loss": 0.70220268, "num_input_tokens_seen": 92010020, "step": 4263, "time_per_iteration": 2.6817667484283447 }, { "auxiliary_loss_clip": 0.01141072, "auxiliary_loss_mlp": 0.01039116, "balance_loss_clip": 1.0507102, "balance_loss_mlp": 1.03150463, "epoch": 0.5127156856850839, "flos": 29205394358400.0, "grad_norm": 1.9466448444659963, "language_loss": 0.68668735, "learning_rate": 2.0148003262119085e-06, "loss": 0.70848924, "num_input_tokens_seen": 92030990, "step": 4264, "time_per_iteration": 2.5926387310028076 }, { "auxiliary_loss_clip": 0.01132762, "auxiliary_loss_mlp": 0.01030384, "balance_loss_clip": 1.04861522, "balance_loss_mlp": 1.02202141, "epoch": 0.5128359285757229, "flos": 13553693412480.0, "grad_norm": 1.9330508920982399, "language_loss": 0.76764655, "learning_rate": 2.0140213747911728e-06, "loss": 0.78927803, "num_input_tokens_seen": 92049525, "step": 4265, "time_per_iteration": 2.534339666366577 }, { "auxiliary_loss_clip": 0.01137208, "auxiliary_loss_mlp": 0.01027385, "balance_loss_clip": 1.05091655, "balance_loss_mlp": 1.01956451, "epoch": 0.5129561714663621, "flos": 25192089820800.0, "grad_norm": 2.2984195678513917, "language_loss": 0.80954206, "learning_rate": 2.013242421243406e-06, "loss": 0.83118796, "num_input_tokens_seen": 92068430, "step": 4266, "time_per_iteration": 2.6071014404296875 }, { "auxiliary_loss_clip": 0.01121758, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.04915237, "balance_loss_mlp": 1.02706146, "epoch": 0.5130764143570011, "flos": 18150223080960.0, "grad_norm": 1.7084537595462241, "language_loss": 0.79124749, "learning_rate": 2.012463465686774e-06, "loss": 0.81281102, "num_input_tokens_seen": 92088180, "step": 4267, "time_per_iteration": 2.6276049613952637 }, { "auxiliary_loss_clip": 0.01021289, "auxiliary_loss_mlp": 0.01004738, "balance_loss_clip": 1.01466119, "balance_loss_mlp": 1.00355172, "epoch": 0.5131966572476402, "flos": 59794896418560.0, "grad_norm": 0.7712023109704312, "language_loss": 0.54791576, "learning_rate": 2.0116845082394446e-06, "loss": 0.56817603, "num_input_tokens_seen": 92153015, "step": 4268, "time_per_iteration": 3.1926779747009277 }, { "auxiliary_loss_clip": 0.01162787, "auxiliary_loss_mlp": 0.01028965, "balance_loss_clip": 1.04818869, "balance_loss_mlp": 1.02118659, "epoch": 0.5133169001382794, "flos": 18515219132160.0, "grad_norm": 1.9862043950702042, "language_loss": 0.78559029, "learning_rate": 2.0109055490195836e-06, "loss": 0.80750775, "num_input_tokens_seen": 92171470, "step": 4269, "time_per_iteration": 2.490267038345337 }, { "auxiliary_loss_clip": 0.01106351, "auxiliary_loss_mlp": 0.01030836, "balance_loss_clip": 1.03785563, "balance_loss_mlp": 1.02353394, "epoch": 0.5134371430289184, "flos": 15523537219200.0, "grad_norm": 2.0399401364698275, "language_loss": 0.64504135, "learning_rate": 2.0101265881453605e-06, "loss": 0.66641319, "num_input_tokens_seen": 92189945, "step": 4270, "time_per_iteration": 3.34428334236145 }, { "auxiliary_loss_clip": 0.01143222, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.05055571, "balance_loss_mlp": 1.0232805, "epoch": 0.5135573859195575, "flos": 21433786911360.0, "grad_norm": 2.04778213618343, "language_loss": 0.78104997, "learning_rate": 2.009347625734941e-06, "loss": 0.80278492, "num_input_tokens_seen": 92209855, "step": 4271, "time_per_iteration": 2.5357444286346436 }, { "auxiliary_loss_clip": 0.01179778, "auxiliary_loss_mlp": 0.01039794, "balance_loss_clip": 1.05521417, "balance_loss_mlp": 1.03196752, "epoch": 0.5136776288101966, "flos": 17712651600000.0, "grad_norm": 2.2186477447461908, "language_loss": 0.75172687, "learning_rate": 2.0085686619064954e-06, "loss": 0.77392256, "num_input_tokens_seen": 92226295, "step": 4272, "time_per_iteration": 2.4501736164093018 }, { "auxiliary_loss_clip": 0.01166655, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.05247438, "balance_loss_mlp": 1.02787161, "epoch": 0.5137978717008357, "flos": 16581680997120.0, "grad_norm": 1.933968326521304, "language_loss": 0.82352865, "learning_rate": 2.00778969677819e-06, "loss": 0.84554887, "num_input_tokens_seen": 92243330, "step": 4273, "time_per_iteration": 2.486768960952759 }, { "auxiliary_loss_clip": 0.01147613, "auxiliary_loss_mlp": 0.0102764, "balance_loss_clip": 1.04891896, "balance_loss_mlp": 1.02061296, "epoch": 0.5139181145914747, "flos": 20668243322880.0, "grad_norm": 1.823184980667518, "language_loss": 0.63962734, "learning_rate": 2.0070107304681934e-06, "loss": 0.66137987, "num_input_tokens_seen": 92262285, "step": 4274, "time_per_iteration": 3.3091821670532227 }, { "auxiliary_loss_clip": 0.01135934, "auxiliary_loss_mlp": 0.01023819, "balance_loss_clip": 1.05141521, "balance_loss_mlp": 1.01679432, "epoch": 0.5140383574821139, "flos": 32926996546560.0, "grad_norm": 1.798578448272556, "language_loss": 0.77987027, "learning_rate": 2.006231763094675e-06, "loss": 0.8014679, "num_input_tokens_seen": 92283305, "step": 4275, "time_per_iteration": 2.6731138229370117 }, { "auxiliary_loss_clip": 0.01144445, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 1.05271268, "balance_loss_mlp": 1.02340508, "epoch": 0.514158600372753, "flos": 19537093152000.0, "grad_norm": 1.8416455320611747, "language_loss": 0.87550735, "learning_rate": 2.0054527947758027e-06, "loss": 0.89726204, "num_input_tokens_seen": 92302105, "step": 4276, "time_per_iteration": 3.227306604385376 }, { "auxiliary_loss_clip": 0.01055807, "auxiliary_loss_mlp": 0.0100296, "balance_loss_clip": 1.015576, "balance_loss_mlp": 1.00182724, "epoch": 0.514278843263392, "flos": 62523855279360.0, "grad_norm": 0.7327879555078525, "language_loss": 0.55964744, "learning_rate": 2.004673825629746e-06, "loss": 0.58023512, "num_input_tokens_seen": 92362885, "step": 4277, "time_per_iteration": 3.8583853244781494 }, { "auxiliary_loss_clip": 0.01138662, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.04494953, "balance_loss_mlp": 1.02448058, "epoch": 0.5143990861540312, "flos": 25882328545920.0, "grad_norm": 1.6022466719555193, "language_loss": 0.72359657, "learning_rate": 2.0038948557746744e-06, "loss": 0.74530685, "num_input_tokens_seen": 92384740, "step": 4278, "time_per_iteration": 2.579613208770752 }, { "auxiliary_loss_clip": 0.01161135, "auxiliary_loss_mlp": 0.01030187, "balance_loss_clip": 1.05219913, "balance_loss_mlp": 1.02268279, "epoch": 0.5145193290446702, "flos": 23330660238720.0, "grad_norm": 2.1340653561184886, "language_loss": 0.75109047, "learning_rate": 2.0031158853287558e-06, "loss": 0.7730037, "num_input_tokens_seen": 92405175, "step": 4279, "time_per_iteration": 2.5271520614624023 }, { "auxiliary_loss_clip": 0.01147987, "auxiliary_loss_mlp": 0.01028547, "balance_loss_clip": 1.05183756, "balance_loss_mlp": 1.0212965, "epoch": 0.5146395719353093, "flos": 22856603518080.0, "grad_norm": 2.451362351287426, "language_loss": 0.70303655, "learning_rate": 2.0023369144101593e-06, "loss": 0.7248019, "num_input_tokens_seen": 92423345, "step": 4280, "time_per_iteration": 2.5385520458221436 }, { "auxiliary_loss_clip": 0.01138382, "auxiliary_loss_mlp": 0.01029801, "balance_loss_clip": 1.04646277, "balance_loss_mlp": 1.02213025, "epoch": 0.5147598148259485, "flos": 26391577616640.0, "grad_norm": 1.5901620818437956, "language_loss": 0.76788986, "learning_rate": 2.0015579431370555e-06, "loss": 0.7895717, "num_input_tokens_seen": 92445025, "step": 4281, "time_per_iteration": 2.565688133239746 }, { "auxiliary_loss_clip": 0.01160391, "auxiliary_loss_mlp": 0.01028537, "balance_loss_clip": 1.05197489, "balance_loss_mlp": 1.02172971, "epoch": 0.5148800577165875, "flos": 29965694561280.0, "grad_norm": 2.465023786578449, "language_loss": 0.6962899, "learning_rate": 2.000778971627612e-06, "loss": 0.71817917, "num_input_tokens_seen": 92464490, "step": 4282, "time_per_iteration": 2.5728650093078613 }, { "auxiliary_loss_clip": 0.01138107, "auxiliary_loss_mlp": 0.01032584, "balance_loss_clip": 1.04622602, "balance_loss_mlp": 1.02537203, "epoch": 0.5150003006072266, "flos": 17931383470080.0, "grad_norm": 1.6752553638808674, "language_loss": 0.90569514, "learning_rate": 2e-06, "loss": 0.92740208, "num_input_tokens_seen": 92482085, "step": 4283, "time_per_iteration": 2.500642776489258 }, { "auxiliary_loss_clip": 0.01172812, "auxiliary_loss_mlp": 0.01022405, "balance_loss_clip": 1.05120277, "balance_loss_mlp": 1.01517498, "epoch": 0.5151205434978657, "flos": 18478733892480.0, "grad_norm": 1.6942927140333357, "language_loss": 0.85317343, "learning_rate": 1.9992210283723878e-06, "loss": 0.87512565, "num_input_tokens_seen": 92499325, "step": 4284, "time_per_iteration": 2.4344708919525146 }, { "auxiliary_loss_clip": 0.01170834, "auxiliary_loss_mlp": 0.01030659, "balance_loss_clip": 1.05148113, "balance_loss_mlp": 1.0231576, "epoch": 0.5152407863885048, "flos": 25341263003520.0, "grad_norm": 1.7003654974467028, "language_loss": 0.79792345, "learning_rate": 1.9984420568629448e-06, "loss": 0.81993842, "num_input_tokens_seen": 92522090, "step": 4285, "time_per_iteration": 2.5750534534454346 }, { "auxiliary_loss_clip": 0.01164562, "auxiliary_loss_mlp": 0.01026743, "balance_loss_clip": 1.05279434, "balance_loss_mlp": 1.01985002, "epoch": 0.5153610292791438, "flos": 18329740277760.0, "grad_norm": 2.044031772054289, "language_loss": 0.78132641, "learning_rate": 1.9976630855898405e-06, "loss": 0.80323946, "num_input_tokens_seen": 92539845, "step": 4286, "time_per_iteration": 2.48429536819458 }, { "auxiliary_loss_clip": 0.01137803, "auxiliary_loss_mlp": 0.0102639, "balance_loss_clip": 1.04178846, "balance_loss_mlp": 1.01907372, "epoch": 0.515481272169783, "flos": 30409945971840.0, "grad_norm": 2.724509720255688, "language_loss": 0.74418485, "learning_rate": 1.9968841146712445e-06, "loss": 0.76582682, "num_input_tokens_seen": 92559460, "step": 4287, "time_per_iteration": 2.591066598892212 }, { "auxiliary_loss_clip": 0.01104015, "auxiliary_loss_mlp": 0.00760992, "balance_loss_clip": 1.04432142, "balance_loss_mlp": 1.0002389, "epoch": 0.5156015150604221, "flos": 23037305863680.0, "grad_norm": 1.503322573046224, "language_loss": 0.71489048, "learning_rate": 1.996105144225326e-06, "loss": 0.73354059, "num_input_tokens_seen": 92579695, "step": 4288, "time_per_iteration": 2.635430335998535 }, { "auxiliary_loss_clip": 0.01156694, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.04894257, "balance_loss_mlp": 1.02557361, "epoch": 0.5157217579510611, "flos": 17858556645120.0, "grad_norm": 1.9039009578743982, "language_loss": 0.78859711, "learning_rate": 1.995326174370254e-06, "loss": 0.81049013, "num_input_tokens_seen": 92598795, "step": 4289, "time_per_iteration": 2.489499092102051 }, { "auxiliary_loss_clip": 0.01157494, "auxiliary_loss_mlp": 0.00760406, "balance_loss_clip": 1.04804647, "balance_loss_mlp": 1.00020742, "epoch": 0.5158420008417003, "flos": 19171486569600.0, "grad_norm": 1.5435974286013598, "language_loss": 0.7285282, "learning_rate": 1.994547205224197e-06, "loss": 0.74770713, "num_input_tokens_seen": 92617700, "step": 4290, "time_per_iteration": 2.483842134475708 }, { "auxiliary_loss_clip": 0.01141864, "auxiliary_loss_mlp": 0.01027229, "balance_loss_clip": 1.04927921, "balance_loss_mlp": 1.02005291, "epoch": 0.5159622437323393, "flos": 22419534827520.0, "grad_norm": 1.9175684838329485, "language_loss": 0.67363322, "learning_rate": 1.993768236905325e-06, "loss": 0.69532406, "num_input_tokens_seen": 92638370, "step": 4291, "time_per_iteration": 2.525869846343994 }, { "auxiliary_loss_clip": 0.01141301, "auxiliary_loss_mlp": 0.01027326, "balance_loss_clip": 1.0459851, "balance_loss_mlp": 1.0197922, "epoch": 0.5160824866229784, "flos": 24603010773120.0, "grad_norm": 3.0731533151732795, "language_loss": 0.65719104, "learning_rate": 1.992989269531807e-06, "loss": 0.67887735, "num_input_tokens_seen": 92657180, "step": 4292, "time_per_iteration": 2.538604497909546 }, { "auxiliary_loss_clip": 0.01140609, "auxiliary_loss_mlp": 0.01022867, "balance_loss_clip": 1.04499817, "balance_loss_mlp": 1.0155828, "epoch": 0.5162027295136175, "flos": 18002737837440.0, "grad_norm": 2.9723148686576883, "language_loss": 0.67778778, "learning_rate": 1.99221030322181e-06, "loss": 0.69942248, "num_input_tokens_seen": 92673985, "step": 4293, "time_per_iteration": 2.4929943084716797 }, { "auxiliary_loss_clip": 0.01147764, "auxiliary_loss_mlp": 0.01024946, "balance_loss_clip": 1.04824901, "balance_loss_mlp": 1.01815987, "epoch": 0.5163229724042566, "flos": 27344611221120.0, "grad_norm": 1.6434157297672074, "language_loss": 0.80803084, "learning_rate": 1.991431338093505e-06, "loss": 0.82975793, "num_input_tokens_seen": 92696340, "step": 4294, "time_per_iteration": 2.5777831077575684 }, { "auxiliary_loss_clip": 0.01146114, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 1.05198085, "balance_loss_mlp": 1.02578831, "epoch": 0.5164432152948957, "flos": 21762764599680.0, "grad_norm": 2.8270037749901387, "language_loss": 0.79189646, "learning_rate": 1.9906523742650587e-06, "loss": 0.81368488, "num_input_tokens_seen": 92715200, "step": 4295, "time_per_iteration": 2.5396621227264404 }, { "auxiliary_loss_clip": 0.01172799, "auxiliary_loss_mlp": 0.01023501, "balance_loss_clip": 1.0490067, "balance_loss_mlp": 1.01610959, "epoch": 0.5165634581855347, "flos": 25550334115200.0, "grad_norm": 1.9802139888884258, "language_loss": 0.77721947, "learning_rate": 1.9898734118546397e-06, "loss": 0.79918242, "num_input_tokens_seen": 92735150, "step": 4296, "time_per_iteration": 3.2893972396850586 }, { "auxiliary_loss_clip": 0.01092482, "auxiliary_loss_mlp": 0.01030336, "balance_loss_clip": 1.04241908, "balance_loss_mlp": 1.02294159, "epoch": 0.5166837010761739, "flos": 19901191363200.0, "grad_norm": 1.562076816308385, "language_loss": 0.80148435, "learning_rate": 1.989094450980416e-06, "loss": 0.82271254, "num_input_tokens_seen": 92755250, "step": 4297, "time_per_iteration": 2.7436141967773438 }, { "auxiliary_loss_clip": 0.01158957, "auxiliary_loss_mlp": 0.01023945, "balance_loss_clip": 1.05072916, "balance_loss_mlp": 1.016891, "epoch": 0.516803943966813, "flos": 26646076454400.0, "grad_norm": 1.8129591042748034, "language_loss": 0.76784611, "learning_rate": 1.9883154917605556e-06, "loss": 0.78967512, "num_input_tokens_seen": 92774460, "step": 4298, "time_per_iteration": 2.966700792312622 }, { "auxiliary_loss_clip": 0.01172466, "auxiliary_loss_mlp": 0.01028643, "balance_loss_clip": 1.05182445, "balance_loss_mlp": 1.02154064, "epoch": 0.516924186857452, "flos": 19682854542720.0, "grad_norm": 1.698789376289148, "language_loss": 0.83196694, "learning_rate": 1.9875365343132262e-06, "loss": 0.85397804, "num_input_tokens_seen": 92791580, "step": 4299, "time_per_iteration": 2.484851837158203 }, { "auxiliary_loss_clip": 0.01160496, "auxiliary_loss_mlp": 0.00760209, "balance_loss_clip": 1.05326211, "balance_loss_mlp": 1.00021482, "epoch": 0.5170444297480912, "flos": 15956583586560.0, "grad_norm": 2.3987613708300466, "language_loss": 0.85086077, "learning_rate": 1.9867575787565946e-06, "loss": 0.87006783, "num_input_tokens_seen": 92806240, "step": 4300, "time_per_iteration": 3.2455408573150635 }, { "auxiliary_loss_clip": 0.0116075, "auxiliary_loss_mlp": 0.01032031, "balance_loss_clip": 1.05208707, "balance_loss_mlp": 1.0238297, "epoch": 0.5171646726387302, "flos": 14174157968640.0, "grad_norm": 1.9726836595820358, "language_loss": 0.8623296, "learning_rate": 1.9859786252088275e-06, "loss": 0.88425744, "num_input_tokens_seen": 92823420, "step": 4301, "time_per_iteration": 2.489762783050537 }, { "auxiliary_loss_clip": 0.01136203, "auxiliary_loss_mlp": 0.01030202, "balance_loss_clip": 1.04831958, "balance_loss_mlp": 1.02297187, "epoch": 0.5172849155293693, "flos": 23578550974080.0, "grad_norm": 2.5937381900647836, "language_loss": 0.6657244, "learning_rate": 1.9851996737880914e-06, "loss": 0.68738848, "num_input_tokens_seen": 92838605, "step": 4302, "time_per_iteration": 2.5541136264801025 }, { "auxiliary_loss_clip": 0.01166765, "auxiliary_loss_mlp": 0.01027333, "balance_loss_clip": 1.05358434, "balance_loss_mlp": 1.01947117, "epoch": 0.5174051584200084, "flos": 14283541860480.0, "grad_norm": 2.4516718615943778, "language_loss": 0.74612677, "learning_rate": 1.9844207246125537e-06, "loss": 0.76806778, "num_input_tokens_seen": 92855185, "step": 4303, "time_per_iteration": 3.211686849594116 }, { "auxiliary_loss_clip": 0.011426, "auxiliary_loss_mlp": 0.01026196, "balance_loss_clip": 1.0474503, "balance_loss_mlp": 1.01904607, "epoch": 0.5175254013106475, "flos": 37889384192640.0, "grad_norm": 1.815214287715659, "language_loss": 0.68193781, "learning_rate": 1.983641777800379e-06, "loss": 0.7036258, "num_input_tokens_seen": 92877830, "step": 4304, "time_per_iteration": 3.4980385303497314 }, { "auxiliary_loss_clip": 0.01048729, "auxiliary_loss_mlp": 0.01006993, "balance_loss_clip": 1.01463199, "balance_loss_mlp": 1.00587273, "epoch": 0.5176456442012866, "flos": 68549737829760.0, "grad_norm": 0.7416688085340349, "language_loss": 0.5877192, "learning_rate": 1.9828628334697343e-06, "loss": 0.60827649, "num_input_tokens_seen": 92945040, "step": 4305, "time_per_iteration": 3.2807750701904297 }, { "auxiliary_loss_clip": 0.01049809, "auxiliary_loss_mlp": 0.01005186, "balance_loss_clip": 1.01679778, "balance_loss_mlp": 1.00392866, "epoch": 0.5177658870919257, "flos": 64084137235200.0, "grad_norm": 0.7628883832547182, "language_loss": 0.54724693, "learning_rate": 1.982083891738784e-06, "loss": 0.56779695, "num_input_tokens_seen": 93005910, "step": 4306, "time_per_iteration": 3.160919189453125 }, { "auxiliary_loss_clip": 0.01139996, "auxiliary_loss_mlp": 0.01022938, "balance_loss_clip": 1.05084968, "balance_loss_mlp": 1.01572907, "epoch": 0.5178861299825648, "flos": 26651248012800.0, "grad_norm": 1.4370681641474516, "language_loss": 0.82882369, "learning_rate": 1.9813049527256923e-06, "loss": 0.85045302, "num_input_tokens_seen": 93026305, "step": 4307, "time_per_iteration": 2.576843500137329 }, { "auxiliary_loss_clip": 0.01130005, "auxiliary_loss_mlp": 0.01028712, "balance_loss_clip": 1.04522121, "balance_loss_mlp": 1.0213685, "epoch": 0.5180063728732038, "flos": 17931886260480.0, "grad_norm": 2.3869369702616114, "language_loss": 0.8236984, "learning_rate": 1.9805260165486252e-06, "loss": 0.84528559, "num_input_tokens_seen": 93045675, "step": 4308, "time_per_iteration": 2.548743724822998 }, { "auxiliary_loss_clip": 0.01159085, "auxiliary_loss_mlp": 0.0102944, "balance_loss_clip": 1.05028069, "balance_loss_mlp": 1.02231109, "epoch": 0.518126615763843, "flos": 19500895221120.0, "grad_norm": 1.8344886278482482, "language_loss": 0.86412328, "learning_rate": 1.9797470833257457e-06, "loss": 0.8860085, "num_input_tokens_seen": 93065375, "step": 4309, "time_per_iteration": 2.4941225051879883 }, { "auxiliary_loss_clip": 0.01161356, "auxiliary_loss_mlp": 0.01026601, "balance_loss_clip": 1.05311108, "balance_loss_mlp": 1.01927006, "epoch": 0.5182468586544821, "flos": 20704082117760.0, "grad_norm": 1.9456945097354985, "language_loss": 0.77518833, "learning_rate": 1.9789681531752177e-06, "loss": 0.79706788, "num_input_tokens_seen": 93085595, "step": 4310, "time_per_iteration": 2.4898009300231934 }, { "auxiliary_loss_clip": 0.01117763, "auxiliary_loss_mlp": 0.01025721, "balance_loss_clip": 1.04811454, "balance_loss_mlp": 1.01907218, "epoch": 0.5183671015451211, "flos": 23112107936640.0, "grad_norm": 1.526410840470581, "language_loss": 0.72547281, "learning_rate": 1.978189226215204e-06, "loss": 0.74690759, "num_input_tokens_seen": 93106140, "step": 4311, "time_per_iteration": 2.6112945079803467 }, { "auxiliary_loss_clip": 0.01173083, "auxiliary_loss_mlp": 0.01028163, "balance_loss_clip": 1.05144417, "balance_loss_mlp": 1.02056897, "epoch": 0.5184873444357603, "flos": 17597090568960.0, "grad_norm": 2.1327617536273684, "language_loss": 0.77043498, "learning_rate": 1.9774103025638675e-06, "loss": 0.79244745, "num_input_tokens_seen": 93124265, "step": 4312, "time_per_iteration": 2.4537482261657715 }, { "auxiliary_loss_clip": 0.0112392, "auxiliary_loss_mlp": 0.01024466, "balance_loss_clip": 1.05333972, "balance_loss_mlp": 1.01717305, "epoch": 0.5186075873263993, "flos": 24936800883840.0, "grad_norm": 1.5101858690840215, "language_loss": 0.76432234, "learning_rate": 1.9766313823393696e-06, "loss": 0.78580618, "num_input_tokens_seen": 93145130, "step": 4313, "time_per_iteration": 2.6433751583099365 }, { "auxiliary_loss_clip": 0.01111494, "auxiliary_loss_mlp": 0.01024891, "balance_loss_clip": 1.04320371, "balance_loss_mlp": 1.01736319, "epoch": 0.5187278302170384, "flos": 15190106244480.0, "grad_norm": 1.9201772478816068, "language_loss": 0.69114101, "learning_rate": 1.975852465659873e-06, "loss": 0.71250486, "num_input_tokens_seen": 93161110, "step": 4314, "time_per_iteration": 2.5761466026306152 }, { "auxiliary_loss_clip": 0.01162248, "auxiliary_loss_mlp": 0.01028845, "balance_loss_clip": 1.05179429, "balance_loss_mlp": 1.02086973, "epoch": 0.5188480731076776, "flos": 25009412227200.0, "grad_norm": 2.5971185341169787, "language_loss": 0.70033312, "learning_rate": 1.9750735526435377e-06, "loss": 0.72224402, "num_input_tokens_seen": 93178055, "step": 4315, "time_per_iteration": 2.5180904865264893 }, { "auxiliary_loss_clip": 0.01144965, "auxiliary_loss_mlp": 0.01031149, "balance_loss_clip": 1.05094957, "balance_loss_mlp": 1.02378178, "epoch": 0.5189683159983166, "flos": 24790141653120.0, "grad_norm": 2.259954388130587, "language_loss": 0.79730552, "learning_rate": 1.974294643408525e-06, "loss": 0.81906664, "num_input_tokens_seen": 93195850, "step": 4316, "time_per_iteration": 2.561110496520996 }, { "auxiliary_loss_clip": 0.01161956, "auxiliary_loss_mlp": 0.01028147, "balance_loss_clip": 1.0490998, "balance_loss_mlp": 1.02073264, "epoch": 0.5190885588889557, "flos": 24754266944640.0, "grad_norm": 1.7838056486556368, "language_loss": 0.66797632, "learning_rate": 1.9735157380729947e-06, "loss": 0.68987727, "num_input_tokens_seen": 93216260, "step": 4317, "time_per_iteration": 2.542891025543213 }, { "auxiliary_loss_clip": 0.01145044, "auxiliary_loss_mlp": 0.01022218, "balance_loss_clip": 1.04825282, "balance_loss_mlp": 1.01531601, "epoch": 0.5192088017795948, "flos": 24712646060160.0, "grad_norm": 3.2412825707357977, "language_loss": 0.83921504, "learning_rate": 1.9727368367551053e-06, "loss": 0.86088765, "num_input_tokens_seen": 93234810, "step": 4318, "time_per_iteration": 2.585916519165039 }, { "auxiliary_loss_clip": 0.01133564, "auxiliary_loss_mlp": 0.01025845, "balance_loss_clip": 1.04657555, "balance_loss_mlp": 1.01831746, "epoch": 0.5193290446702339, "flos": 27229588894080.0, "grad_norm": 1.8749561341324799, "language_loss": 0.68556756, "learning_rate": 1.9719579395730164e-06, "loss": 0.70716166, "num_input_tokens_seen": 93254185, "step": 4319, "time_per_iteration": 2.6010890007019043 }, { "auxiliary_loss_clip": 0.01175793, "auxiliary_loss_mlp": 0.01031578, "balance_loss_clip": 1.05436301, "balance_loss_mlp": 1.02354276, "epoch": 0.5194492875608729, "flos": 11473352392320.0, "grad_norm": 1.9512024821535063, "language_loss": 0.93017763, "learning_rate": 1.9711790466448854e-06, "loss": 0.95225132, "num_input_tokens_seen": 93268205, "step": 4320, "time_per_iteration": 2.4232699871063232 }, { "auxiliary_loss_clip": 0.01121318, "auxiliary_loss_mlp": 0.0103391, "balance_loss_clip": 1.04561245, "balance_loss_mlp": 1.02583957, "epoch": 0.5195695304515121, "flos": 20338906498560.0, "grad_norm": 1.9815859668866573, "language_loss": 0.71214592, "learning_rate": 1.9704001580888704e-06, "loss": 0.73369813, "num_input_tokens_seen": 93286945, "step": 4321, "time_per_iteration": 2.618446111679077 }, { "auxiliary_loss_clip": 0.01140572, "auxiliary_loss_mlp": 0.00760529, "balance_loss_clip": 1.04553103, "balance_loss_mlp": 1.00026608, "epoch": 0.5196897733421512, "flos": 20048317470720.0, "grad_norm": 1.7791663164805345, "language_loss": 0.87203765, "learning_rate": 1.9696212740231283e-06, "loss": 0.89104867, "num_input_tokens_seen": 93305595, "step": 4322, "time_per_iteration": 3.2443859577178955 }, { "auxiliary_loss_clip": 0.011627, "auxiliary_loss_mlp": 0.01031559, "balance_loss_clip": 1.04720616, "balance_loss_mlp": 1.02347028, "epoch": 0.5198100162327902, "flos": 23805507058560.0, "grad_norm": 2.06420142559047, "language_loss": 0.82255387, "learning_rate": 1.9688423945658146e-06, "loss": 0.84449649, "num_input_tokens_seen": 93326460, "step": 4323, "time_per_iteration": 2.5548367500305176 }, { "auxiliary_loss_clip": 0.01106995, "auxiliary_loss_mlp": 0.01031549, "balance_loss_clip": 1.03936243, "balance_loss_mlp": 1.02356827, "epoch": 0.5199302591234293, "flos": 24023951619840.0, "grad_norm": 2.184852163598746, "language_loss": 0.7212401, "learning_rate": 1.9680635198350845e-06, "loss": 0.74262553, "num_input_tokens_seen": 93346170, "step": 4324, "time_per_iteration": 2.611891508102417 }, { "auxiliary_loss_clip": 0.01160757, "auxiliary_loss_mlp": 0.01029769, "balance_loss_clip": 1.04888391, "balance_loss_mlp": 1.02135921, "epoch": 0.5200505020140684, "flos": 26359366095360.0, "grad_norm": 2.086551594614165, "language_loss": 0.72409129, "learning_rate": 1.967284649949093e-06, "loss": 0.74599653, "num_input_tokens_seen": 93365380, "step": 4325, "time_per_iteration": 2.5255751609802246 }, { "auxiliary_loss_clip": 0.01125953, "auxiliary_loss_mlp": 0.01028322, "balance_loss_clip": 1.04318893, "balance_loss_mlp": 1.02036476, "epoch": 0.5201707449047075, "flos": 39604262284800.0, "grad_norm": 3.2213747430279027, "language_loss": 0.72593844, "learning_rate": 1.966505785025994e-06, "loss": 0.74748117, "num_input_tokens_seen": 93387285, "step": 4326, "time_per_iteration": 3.4943788051605225 }, { "auxiliary_loss_clip": 0.01129897, "auxiliary_loss_mlp": 0.01029594, "balance_loss_clip": 1.04825616, "balance_loss_mlp": 1.02190471, "epoch": 0.5202909877953465, "flos": 53682788292480.0, "grad_norm": 1.652744238698553, "language_loss": 0.7614094, "learning_rate": 1.965726925183941e-06, "loss": 0.7830044, "num_input_tokens_seen": 93410390, "step": 4327, "time_per_iteration": 2.824617624282837 }, { "auxiliary_loss_clip": 0.01177117, "auxiliary_loss_mlp": 0.01032278, "balance_loss_clip": 1.05462515, "balance_loss_mlp": 1.02514899, "epoch": 0.5204112306859857, "flos": 19537021324800.0, "grad_norm": 1.691198331155662, "language_loss": 0.84921753, "learning_rate": 1.964948070541087e-06, "loss": 0.87131149, "num_input_tokens_seen": 93429050, "step": 4328, "time_per_iteration": 2.449265480041504 }, { "auxiliary_loss_clip": 0.01149014, "auxiliary_loss_mlp": 0.01027824, "balance_loss_clip": 1.04669809, "balance_loss_mlp": 1.02057326, "epoch": 0.5205314735766248, "flos": 15304697608320.0, "grad_norm": 2.412532269474611, "language_loss": 0.69546652, "learning_rate": 1.9641692212155816e-06, "loss": 0.71723497, "num_input_tokens_seen": 93446815, "step": 4329, "time_per_iteration": 4.035771608352661 }, { "auxiliary_loss_clip": 0.01117321, "auxiliary_loss_mlp": 0.01033209, "balance_loss_clip": 1.04851675, "balance_loss_mlp": 1.02580285, "epoch": 0.5206517164672638, "flos": 59263701160320.0, "grad_norm": 1.7867410301765996, "language_loss": 0.72398216, "learning_rate": 1.9633903773255777e-06, "loss": 0.74548745, "num_input_tokens_seen": 93469130, "step": 4330, "time_per_iteration": 2.9070353507995605 }, { "auxiliary_loss_clip": 0.01171196, "auxiliary_loss_mlp": 0.01026661, "balance_loss_clip": 1.05031657, "balance_loss_mlp": 1.01892388, "epoch": 0.520771959357903, "flos": 26871129118080.0, "grad_norm": 1.6932988932973856, "language_loss": 0.74663693, "learning_rate": 1.9626115389892237e-06, "loss": 0.76861548, "num_input_tokens_seen": 93489920, "step": 4331, "time_per_iteration": 2.5091192722320557 }, { "auxiliary_loss_clip": 0.01138332, "auxiliary_loss_mlp": 0.01024844, "balance_loss_clip": 1.04996037, "balance_loss_mlp": 1.01762557, "epoch": 0.520892202248542, "flos": 26907075653760.0, "grad_norm": 1.9457100968877312, "language_loss": 0.85235435, "learning_rate": 1.96183270632467e-06, "loss": 0.87398612, "num_input_tokens_seen": 93509770, "step": 4332, "time_per_iteration": 2.580357551574707 }, { "auxiliary_loss_clip": 0.01123231, "auxiliary_loss_mlp": 0.00761652, "balance_loss_clip": 1.04533958, "balance_loss_mlp": 1.00020421, "epoch": 0.5210124451391811, "flos": 25849434666240.0, "grad_norm": 1.6158249132612017, "language_loss": 0.79357386, "learning_rate": 1.9610538794500644e-06, "loss": 0.81242263, "num_input_tokens_seen": 93529320, "step": 4333, "time_per_iteration": 2.5829505920410156 }, { "auxiliary_loss_clip": 0.01035336, "auxiliary_loss_mlp": 0.01001922, "balance_loss_clip": 1.01303053, "balance_loss_mlp": 1.00080705, "epoch": 0.5211326880298203, "flos": 70553804319360.0, "grad_norm": 0.7750401140738005, "language_loss": 0.59450758, "learning_rate": 1.9602750584835542e-06, "loss": 0.6148802, "num_input_tokens_seen": 93595255, "step": 4334, "time_per_iteration": 3.244784116744995 }, { "auxiliary_loss_clip": 0.01141891, "auxiliary_loss_mlp": 0.01025101, "balance_loss_clip": 1.04608369, "balance_loss_mlp": 1.01795387, "epoch": 0.5212529309204593, "flos": 15628898787840.0, "grad_norm": 2.1032570279648586, "language_loss": 0.82714176, "learning_rate": 1.959496243543286e-06, "loss": 0.84881163, "num_input_tokens_seen": 93613135, "step": 4335, "time_per_iteration": 2.501112222671509 }, { "auxiliary_loss_clip": 0.01169467, "auxiliary_loss_mlp": 0.01038869, "balance_loss_clip": 1.0576998, "balance_loss_mlp": 1.03119802, "epoch": 0.5213731738110984, "flos": 26242655829120.0, "grad_norm": 5.267384649767942, "language_loss": 0.7927568, "learning_rate": 1.9587174347474057e-06, "loss": 0.8148402, "num_input_tokens_seen": 93629645, "step": 4336, "time_per_iteration": 2.517946481704712 }, { "auxiliary_loss_clip": 0.01104374, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.04257238, "balance_loss_mlp": 1.02295518, "epoch": 0.5214934167017375, "flos": 19418407637760.0, "grad_norm": 2.897984696764316, "language_loss": 0.81706464, "learning_rate": 1.9579386322140574e-06, "loss": 0.83841467, "num_input_tokens_seen": 93645325, "step": 4337, "time_per_iteration": 2.5837342739105225 }, { "auxiliary_loss_clip": 0.01177382, "auxiliary_loss_mlp": 0.00761355, "balance_loss_clip": 1.05347574, "balance_loss_mlp": 1.00024486, "epoch": 0.5216136595923766, "flos": 30955788023040.0, "grad_norm": 1.7238683178322511, "language_loss": 0.80713773, "learning_rate": 1.9571598360613854e-06, "loss": 0.82652509, "num_input_tokens_seen": 93668200, "step": 4338, "time_per_iteration": 2.5425260066986084 }, { "auxiliary_loss_clip": 0.01132269, "auxiliary_loss_mlp": 0.0103278, "balance_loss_clip": 1.04461873, "balance_loss_mlp": 1.02483499, "epoch": 0.5217339024830157, "flos": 21945047143680.0, "grad_norm": 2.237095052034925, "language_loss": 0.6967032, "learning_rate": 1.956381046407532e-06, "loss": 0.71835375, "num_input_tokens_seen": 93688495, "step": 4339, "time_per_iteration": 2.5344159603118896 }, { "auxiliary_loss_clip": 0.01128862, "auxiliary_loss_mlp": 0.01024317, "balance_loss_clip": 1.04589725, "balance_loss_mlp": 1.01691985, "epoch": 0.5218541453736548, "flos": 20923209037440.0, "grad_norm": 2.2745408874591173, "language_loss": 0.86170483, "learning_rate": 1.9556022633706394e-06, "loss": 0.88323665, "num_input_tokens_seen": 93707285, "step": 4340, "time_per_iteration": 2.620112180709839 }, { "auxiliary_loss_clip": 0.01140042, "auxiliary_loss_mlp": 0.01024428, "balance_loss_clip": 1.04697442, "balance_loss_mlp": 1.01685774, "epoch": 0.5219743882642939, "flos": 23951663498880.0, "grad_norm": 1.7515197537141873, "language_loss": 0.79680735, "learning_rate": 1.954823487068848e-06, "loss": 0.818452, "num_input_tokens_seen": 93727495, "step": 4341, "time_per_iteration": 2.58849835395813 }, { "auxiliary_loss_clip": 0.011613, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.05462301, "balance_loss_mlp": 1.02478909, "epoch": 0.5220946311549329, "flos": 28799280213120.0, "grad_norm": 1.613758661386504, "language_loss": 0.80984467, "learning_rate": 1.9540447176202976e-06, "loss": 0.83177596, "num_input_tokens_seen": 93748740, "step": 4342, "time_per_iteration": 2.574246406555176 }, { "auxiliary_loss_clip": 0.01054609, "auxiliary_loss_mlp": 0.0100149, "balance_loss_clip": 1.01460981, "balance_loss_mlp": 1.00031018, "epoch": 0.5222148740455721, "flos": 67189369017600.0, "grad_norm": 0.9085845522109597, "language_loss": 0.60751379, "learning_rate": 1.9532659551431272e-06, "loss": 0.62807477, "num_input_tokens_seen": 93815770, "step": 4343, "time_per_iteration": 3.255690097808838 }, { "auxiliary_loss_clip": 0.01162333, "auxiliary_loss_mlp": 0.01029815, "balance_loss_clip": 1.05084801, "balance_loss_mlp": 1.02241802, "epoch": 0.5223351169362112, "flos": 61856164339200.0, "grad_norm": 1.6197307792173696, "language_loss": 0.67538464, "learning_rate": 1.9524871997554744e-06, "loss": 0.69730616, "num_input_tokens_seen": 93843530, "step": 4344, "time_per_iteration": 2.866302490234375 }, { "auxiliary_loss_clip": 0.01162189, "auxiliary_loss_mlp": 0.01027231, "balance_loss_clip": 1.05266571, "balance_loss_mlp": 1.01991725, "epoch": 0.5224553598268502, "flos": 14647388676480.0, "grad_norm": 2.1186597073799263, "language_loss": 0.8056972, "learning_rate": 1.951708451575475e-06, "loss": 0.82759142, "num_input_tokens_seen": 93860595, "step": 4345, "time_per_iteration": 2.459747791290283 }, { "auxiliary_loss_clip": 0.01139216, "auxiliary_loss_mlp": 0.01025429, "balance_loss_clip": 1.04612708, "balance_loss_mlp": 1.01786518, "epoch": 0.5225756027174894, "flos": 14826043946880.0, "grad_norm": 2.011513654271153, "language_loss": 0.82454473, "learning_rate": 1.9509297107212657e-06, "loss": 0.84619117, "num_input_tokens_seen": 93877365, "step": 4346, "time_per_iteration": 2.540949583053589 }, { "auxiliary_loss_clip": 0.0117392, "auxiliary_loss_mlp": 0.01025493, "balance_loss_clip": 1.05204129, "balance_loss_mlp": 1.0182631, "epoch": 0.5226958456081284, "flos": 23512009029120.0, "grad_norm": 1.6236361113761322, "language_loss": 0.79030716, "learning_rate": 1.95015097731098e-06, "loss": 0.81230128, "num_input_tokens_seen": 93896855, "step": 4347, "time_per_iteration": 2.4724838733673096 }, { "auxiliary_loss_clip": 0.01175885, "auxiliary_loss_mlp": 0.01025367, "balance_loss_clip": 1.05333185, "balance_loss_mlp": 1.01806557, "epoch": 0.5228160884987675, "flos": 19062928690560.0, "grad_norm": 2.082196817356619, "language_loss": 0.81988221, "learning_rate": 1.949372251462751e-06, "loss": 0.84189463, "num_input_tokens_seen": 93914270, "step": 4348, "time_per_iteration": 3.157294511795044 }, { "auxiliary_loss_clip": 0.01133509, "auxiliary_loss_mlp": 0.0076044, "balance_loss_clip": 1.04779506, "balance_loss_mlp": 1.00024152, "epoch": 0.5229363313894067, "flos": 21063224252160.0, "grad_norm": 2.0394488985735437, "language_loss": 0.82965827, "learning_rate": 1.9485935332947124e-06, "loss": 0.84859776, "num_input_tokens_seen": 93932180, "step": 4349, "time_per_iteration": 2.5740952491760254 }, { "auxiliary_loss_clip": 0.01140629, "auxiliary_loss_mlp": 0.01028169, "balance_loss_clip": 1.04833508, "balance_loss_mlp": 1.0213027, "epoch": 0.5230565742800457, "flos": 14830389492480.0, "grad_norm": 2.5631441943906954, "language_loss": 0.83755541, "learning_rate": 1.947814822924993e-06, "loss": 0.85924333, "num_input_tokens_seen": 93949690, "step": 4350, "time_per_iteration": 2.5039548873901367 }, { "auxiliary_loss_clip": 0.01176177, "auxiliary_loss_mlp": 0.01027432, "balance_loss_clip": 1.05493808, "balance_loss_mlp": 1.02038121, "epoch": 0.5231768171706848, "flos": 25813021253760.0, "grad_norm": 1.9205349783857655, "language_loss": 0.82680279, "learning_rate": 1.9470361204717236e-06, "loss": 0.84883887, "num_input_tokens_seen": 93968830, "step": 4351, "time_per_iteration": 2.501235246658325 }, { "auxiliary_loss_clip": 0.01134154, "auxiliary_loss_mlp": 0.00760982, "balance_loss_clip": 1.04556167, "balance_loss_mlp": 1.0002116, "epoch": 0.5232970600613239, "flos": 22743807834240.0, "grad_norm": 1.5978927249794943, "language_loss": 0.80635512, "learning_rate": 1.9462574260530326e-06, "loss": 0.82530648, "num_input_tokens_seen": 93989110, "step": 4352, "time_per_iteration": 3.3562674522399902 }, { "auxiliary_loss_clip": 0.01150271, "auxiliary_loss_mlp": 0.01021718, "balance_loss_clip": 1.0486815, "balance_loss_mlp": 1.01419556, "epoch": 0.523417302951963, "flos": 17310703432320.0, "grad_norm": 1.7879170412746461, "language_loss": 0.81106943, "learning_rate": 1.9454787397870472e-06, "loss": 0.8327893, "num_input_tokens_seen": 94006430, "step": 4353, "time_per_iteration": 2.487812042236328 }, { "auxiliary_loss_clip": 0.01092645, "auxiliary_loss_mlp": 0.01026017, "balance_loss_clip": 1.04470205, "balance_loss_mlp": 1.01873624, "epoch": 0.523537545842602, "flos": 18551740285440.0, "grad_norm": 1.946823981156344, "language_loss": 0.71794719, "learning_rate": 1.944700061791894e-06, "loss": 0.73913378, "num_input_tokens_seen": 94024825, "step": 4354, "time_per_iteration": 2.5891847610473633 }, { "auxiliary_loss_clip": 0.01159527, "auxiliary_loss_mlp": 0.01029187, "balance_loss_clip": 1.05085588, "balance_loss_mlp": 1.02244294, "epoch": 0.5236577887332411, "flos": 19719267955200.0, "grad_norm": 2.087160255965262, "language_loss": 0.65698278, "learning_rate": 1.943921392185698e-06, "loss": 0.67886996, "num_input_tokens_seen": 94043450, "step": 4355, "time_per_iteration": 3.348118543624878 }, { "auxiliary_loss_clip": 0.01145809, "auxiliary_loss_mlp": 0.01026692, "balance_loss_clip": 1.04789948, "balance_loss_mlp": 1.01986098, "epoch": 0.5237780316238803, "flos": 23550218121600.0, "grad_norm": 2.0378390808981814, "language_loss": 0.77018619, "learning_rate": 1.9431427310865814e-06, "loss": 0.79191124, "num_input_tokens_seen": 94063055, "step": 4356, "time_per_iteration": 2.5422146320343018 }, { "auxiliary_loss_clip": 0.01114096, "auxiliary_loss_mlp": 0.01029684, "balance_loss_clip": 1.0449661, "balance_loss_mlp": 1.02291024, "epoch": 0.5238982745145193, "flos": 22491894775680.0, "grad_norm": 1.6404692184753504, "language_loss": 0.78523147, "learning_rate": 1.942364078612667e-06, "loss": 0.80666935, "num_input_tokens_seen": 94081785, "step": 4357, "time_per_iteration": 2.5725324153900146 }, { "auxiliary_loss_clip": 0.01136276, "auxiliary_loss_mlp": 0.01028908, "balance_loss_clip": 1.04681921, "balance_loss_mlp": 1.0215354, "epoch": 0.5240185174051584, "flos": 27088927234560.0, "grad_norm": 1.8566216966224571, "language_loss": 0.75223601, "learning_rate": 1.9415854348820765e-06, "loss": 0.77388787, "num_input_tokens_seen": 94101635, "step": 4358, "time_per_iteration": 2.597478151321411 }, { "auxiliary_loss_clip": 0.0116421, "auxiliary_loss_mlp": 0.01025698, "balance_loss_clip": 1.05225587, "balance_loss_mlp": 1.01831877, "epoch": 0.5241387602957975, "flos": 22674680110080.0, "grad_norm": 2.7837447715986094, "language_loss": 0.68431997, "learning_rate": 1.940806800012929e-06, "loss": 0.70621914, "num_input_tokens_seen": 94121705, "step": 4359, "time_per_iteration": 2.50307035446167 }, { "auxiliary_loss_clip": 0.01113817, "auxiliary_loss_mlp": 0.00761515, "balance_loss_clip": 1.04727066, "balance_loss_mlp": 1.00024462, "epoch": 0.5242590031864366, "flos": 40553453134080.0, "grad_norm": 2.3060841951438498, "language_loss": 0.63183236, "learning_rate": 1.9400281741233432e-06, "loss": 0.65058565, "num_input_tokens_seen": 94146595, "step": 4360, "time_per_iteration": 2.774470090866089 }, { "auxiliary_loss_clip": 0.01031223, "auxiliary_loss_mlp": 0.01002283, "balance_loss_clip": 1.01648259, "balance_loss_mlp": 1.00117993, "epoch": 0.5243792460770756, "flos": 66676313105280.0, "grad_norm": 0.6597339566753475, "language_loss": 0.52570558, "learning_rate": 1.939249557331435e-06, "loss": 0.54604065, "num_input_tokens_seen": 94212410, "step": 4361, "time_per_iteration": 3.1993675231933594 }, { "auxiliary_loss_clip": 0.01141556, "auxiliary_loss_mlp": 0.0103273, "balance_loss_clip": 1.04957438, "balance_loss_mlp": 1.02518678, "epoch": 0.5244994889677148, "flos": 28183663992960.0, "grad_norm": 1.9905030258088594, "language_loss": 0.72913051, "learning_rate": 1.938470949755321e-06, "loss": 0.75087339, "num_input_tokens_seen": 94232290, "step": 4362, "time_per_iteration": 2.620419979095459 }, { "auxiliary_loss_clip": 0.01036174, "auxiliary_loss_mlp": 0.01004584, "balance_loss_clip": 1.01458609, "balance_loss_mlp": 1.00334454, "epoch": 0.5246197318583539, "flos": 65950379239680.0, "grad_norm": 0.8118250140638396, "language_loss": 0.55707443, "learning_rate": 1.937692351513115e-06, "loss": 0.57748204, "num_input_tokens_seen": 94291285, "step": 4363, "time_per_iteration": 3.1110446453094482 }, { "auxiliary_loss_clip": 0.01164913, "auxiliary_loss_mlp": 0.01023478, "balance_loss_clip": 1.05045283, "balance_loss_mlp": 1.01557696, "epoch": 0.5247399747489929, "flos": 21033490769280.0, "grad_norm": 1.6967105846823882, "language_loss": 0.80362284, "learning_rate": 1.9369137627229297e-06, "loss": 0.82550681, "num_input_tokens_seen": 94309685, "step": 4364, "time_per_iteration": 2.5113067626953125 }, { "auxiliary_loss_clip": 0.01158774, "auxiliary_loss_mlp": 0.01026807, "balance_loss_clip": 1.05170727, "balance_loss_mlp": 1.01894546, "epoch": 0.5248602176396321, "flos": 19025940660480.0, "grad_norm": 1.965114968892247, "language_loss": 0.88237119, "learning_rate": 1.936135183502877e-06, "loss": 0.90422702, "num_input_tokens_seen": 94326985, "step": 4365, "time_per_iteration": 2.477025032043457 }, { "auxiliary_loss_clip": 0.01134142, "auxiliary_loss_mlp": 0.01034315, "balance_loss_clip": 1.04695344, "balance_loss_mlp": 1.02637267, "epoch": 0.5249804605302711, "flos": 22200084685440.0, "grad_norm": 2.0786733367334893, "language_loss": 0.80465233, "learning_rate": 1.935356613971066e-06, "loss": 0.82633698, "num_input_tokens_seen": 94347645, "step": 4366, "time_per_iteration": 2.576385021209717 }, { "auxiliary_loss_clip": 0.01144672, "auxiliary_loss_mlp": 0.00761221, "balance_loss_clip": 1.04838586, "balance_loss_mlp": 1.0002265, "epoch": 0.5251007034209102, "flos": 23805686626560.0, "grad_norm": 1.7116912887748048, "language_loss": 0.76765323, "learning_rate": 1.9345780542456047e-06, "loss": 0.78671217, "num_input_tokens_seen": 94367020, "step": 4367, "time_per_iteration": 2.551572561264038 }, { "auxiliary_loss_clip": 0.01150562, "auxiliary_loss_mlp": 0.01031839, "balance_loss_clip": 1.05006611, "balance_loss_mlp": 1.02445138, "epoch": 0.5252209463115494, "flos": 23294605962240.0, "grad_norm": 1.9672417099438106, "language_loss": 0.71493721, "learning_rate": 1.9337995044446007e-06, "loss": 0.73676121, "num_input_tokens_seen": 94385860, "step": 4368, "time_per_iteration": 2.5049521923065186 }, { "auxiliary_loss_clip": 0.01165455, "auxiliary_loss_mlp": 0.01027592, "balance_loss_clip": 1.05131364, "balance_loss_mlp": 1.0201174, "epoch": 0.5253411892021884, "flos": 19828687760640.0, "grad_norm": 2.7320689642143923, "language_loss": 0.80467808, "learning_rate": 1.9330209646861596e-06, "loss": 0.82660854, "num_input_tokens_seen": 94405010, "step": 4369, "time_per_iteration": 2.481038808822632 }, { "auxiliary_loss_clip": 0.01144789, "auxiliary_loss_mlp": 0.01026782, "balance_loss_clip": 1.0503459, "balance_loss_mlp": 1.01958776, "epoch": 0.5254614320928275, "flos": 24133730561280.0, "grad_norm": 1.6060303564818765, "language_loss": 0.77574998, "learning_rate": 1.9322424350883843e-06, "loss": 0.79746568, "num_input_tokens_seen": 94426845, "step": 4370, "time_per_iteration": 2.5644166469573975 }, { "auxiliary_loss_clip": 0.01149222, "auxiliary_loss_mlp": 0.01022335, "balance_loss_clip": 1.04956937, "balance_loss_mlp": 1.01508391, "epoch": 0.5255816749834666, "flos": 24644954880000.0, "grad_norm": 1.6694470456080068, "language_loss": 0.78758436, "learning_rate": 1.931463915769379e-06, "loss": 0.80929989, "num_input_tokens_seen": 94446960, "step": 4371, "time_per_iteration": 2.5521018505096436 }, { "auxiliary_loss_clip": 0.01116411, "auxiliary_loss_mlp": 0.01026436, "balance_loss_clip": 1.04487014, "balance_loss_mlp": 1.01933098, "epoch": 0.5257019178741057, "flos": 14136595320960.0, "grad_norm": 2.1373168395041033, "language_loss": 0.74047267, "learning_rate": 1.930685406847242e-06, "loss": 0.76190114, "num_input_tokens_seen": 94461535, "step": 4372, "time_per_iteration": 2.564490556716919 }, { "auxiliary_loss_clip": 0.01142901, "auxiliary_loss_mlp": 0.01028419, "balance_loss_clip": 1.04792595, "balance_loss_mlp": 1.02143908, "epoch": 0.5258221607647448, "flos": 23548961145600.0, "grad_norm": 1.636842009503532, "language_loss": 0.81785357, "learning_rate": 1.9299069084400734e-06, "loss": 0.83956683, "num_input_tokens_seen": 94482395, "step": 4373, "time_per_iteration": 2.544419288635254 }, { "auxiliary_loss_clip": 0.01130968, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.04940557, "balance_loss_mlp": 1.02253175, "epoch": 0.5259424036553839, "flos": 24966103403520.0, "grad_norm": 2.4569699442096162, "language_loss": 0.69965005, "learning_rate": 1.9291284206659717e-06, "loss": 0.72125936, "num_input_tokens_seen": 94500580, "step": 4374, "time_per_iteration": 3.4046435356140137 }, { "auxiliary_loss_clip": 0.0117571, "auxiliary_loss_mlp": 0.01030943, "balance_loss_clip": 1.05367374, "balance_loss_mlp": 1.02387428, "epoch": 0.526062646546023, "flos": 28763908295040.0, "grad_norm": 2.1000201820796813, "language_loss": 0.71374583, "learning_rate": 1.928349943643032e-06, "loss": 0.73581237, "num_input_tokens_seen": 94519680, "step": 4375, "time_per_iteration": 2.524514675140381 }, { "auxiliary_loss_clip": 0.0115899, "auxiliary_loss_mlp": 0.01029784, "balance_loss_clip": 1.05470145, "balance_loss_mlp": 1.02239311, "epoch": 0.526182889436662, "flos": 22821375254400.0, "grad_norm": 1.6615569456083212, "language_loss": 0.81820965, "learning_rate": 1.9275714774893493e-06, "loss": 0.84009743, "num_input_tokens_seen": 94539135, "step": 4376, "time_per_iteration": 2.5265250205993652 }, { "auxiliary_loss_clip": 0.0112082, "auxiliary_loss_mlp": 0.01037741, "balance_loss_clip": 1.04477191, "balance_loss_mlp": 1.02982569, "epoch": 0.5263031323273012, "flos": 22929466256640.0, "grad_norm": 2.105748643431938, "language_loss": 0.72955751, "learning_rate": 1.9267930223230154e-06, "loss": 0.7511431, "num_input_tokens_seen": 94557610, "step": 4377, "time_per_iteration": 2.5526437759399414 }, { "auxiliary_loss_clip": 0.0114894, "auxiliary_loss_mlp": 0.01028753, "balance_loss_clip": 1.05027032, "balance_loss_mlp": 1.0212934, "epoch": 0.5264233752179402, "flos": 17748634049280.0, "grad_norm": 1.9434440504980581, "language_loss": 0.784428, "learning_rate": 1.9260145782621224e-06, "loss": 0.80620492, "num_input_tokens_seen": 94575390, "step": 4378, "time_per_iteration": 3.2847650051116943 }, { "auxiliary_loss_clip": 0.01144389, "auxiliary_loss_mlp": 0.01032803, "balance_loss_clip": 1.05181515, "balance_loss_mlp": 1.025352, "epoch": 0.5265436181085793, "flos": 24421626069120.0, "grad_norm": 1.7751260769894401, "language_loss": 0.87915868, "learning_rate": 1.925236145424758e-06, "loss": 0.90093058, "num_input_tokens_seen": 94594210, "step": 4379, "time_per_iteration": 2.6108434200286865 }, { "auxiliary_loss_clip": 0.01059582, "auxiliary_loss_mlp": 0.01001626, "balance_loss_clip": 1.01755476, "balance_loss_mlp": 1.00032067, "epoch": 0.5266638609992185, "flos": 69207298156800.0, "grad_norm": 0.7023055386252935, "language_loss": 0.57585275, "learning_rate": 1.924457723929012e-06, "loss": 0.59646475, "num_input_tokens_seen": 94665020, "step": 4380, "time_per_iteration": 3.2407994270324707 }, { "auxiliary_loss_clip": 0.01161836, "auxiliary_loss_mlp": 0.01025211, "balance_loss_clip": 1.05118442, "balance_loss_mlp": 1.0180403, "epoch": 0.5267841038898575, "flos": 20738699850240.0, "grad_norm": 1.5298503049488668, "language_loss": 0.82867837, "learning_rate": 1.9236793138929685e-06, "loss": 0.8505488, "num_input_tokens_seen": 94684290, "step": 4381, "time_per_iteration": 4.029268503189087 }, { "auxiliary_loss_clip": 0.01163627, "auxiliary_loss_mlp": 0.01027369, "balance_loss_clip": 1.05062282, "balance_loss_mlp": 1.02000141, "epoch": 0.5269043467804966, "flos": 17234392988160.0, "grad_norm": 1.9798652828125796, "language_loss": 0.81377637, "learning_rate": 1.9229009154347133e-06, "loss": 0.83568627, "num_input_tokens_seen": 94701880, "step": 4382, "time_per_iteration": 2.4519734382629395 }, { "auxiliary_loss_clip": 0.01104083, "auxiliary_loss_mlp": 0.00760668, "balance_loss_clip": 1.04409552, "balance_loss_mlp": 1.00022781, "epoch": 0.5270245896711357, "flos": 18223157646720.0, "grad_norm": 2.0605447722919195, "language_loss": 0.80200613, "learning_rate": 1.922122528672327e-06, "loss": 0.82065368, "num_input_tokens_seen": 94720545, "step": 4383, "time_per_iteration": 2.574641466140747 }, { "auxiliary_loss_clip": 0.01172337, "auxiliary_loss_mlp": 0.0103071, "balance_loss_clip": 1.05177867, "balance_loss_mlp": 1.02368295, "epoch": 0.5271448325617748, "flos": 21287558643840.0, "grad_norm": 2.1082110865424704, "language_loss": 0.78167868, "learning_rate": 1.9213441537238914e-06, "loss": 0.80370915, "num_input_tokens_seen": 94737420, "step": 4384, "time_per_iteration": 2.442627429962158 }, { "auxiliary_loss_clip": 0.01016101, "auxiliary_loss_mlp": 0.01003069, "balance_loss_clip": 1.01402915, "balance_loss_mlp": 1.00193644, "epoch": 0.5272650754524139, "flos": 65495497403520.0, "grad_norm": 0.8355061940010338, "language_loss": 0.57331359, "learning_rate": 1.920565790707485e-06, "loss": 0.59350526, "num_input_tokens_seen": 94802810, "step": 4385, "time_per_iteration": 3.329414129257202 }, { "auxiliary_loss_clip": 0.01123016, "auxiliary_loss_mlp": 0.01028499, "balance_loss_clip": 1.04630411, "balance_loss_mlp": 1.02088177, "epoch": 0.527385318343053, "flos": 19676426008320.0, "grad_norm": 2.152839378443088, "language_loss": 0.66104859, "learning_rate": 1.9197874397411853e-06, "loss": 0.68256372, "num_input_tokens_seen": 94819440, "step": 4386, "time_per_iteration": 2.585705041885376 }, { "auxiliary_loss_clip": 0.0112922, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.04308963, "balance_loss_mlp": 1.02345622, "epoch": 0.5275055612336921, "flos": 12712018947840.0, "grad_norm": 2.9900514082391, "language_loss": 0.6748929, "learning_rate": 1.919009100943067e-06, "loss": 0.69650126, "num_input_tokens_seen": 94835130, "step": 4387, "time_per_iteration": 2.522519826889038 }, { "auxiliary_loss_clip": 0.01128089, "auxiliary_loss_mlp": 0.01026851, "balance_loss_clip": 1.04836178, "balance_loss_mlp": 1.01924229, "epoch": 0.5276258041243311, "flos": 17749029098880.0, "grad_norm": 2.3868783146357826, "language_loss": 0.65661532, "learning_rate": 1.9182307744312043e-06, "loss": 0.67816472, "num_input_tokens_seen": 94852235, "step": 4388, "time_per_iteration": 2.5790281295776367 }, { "auxiliary_loss_clip": 0.01149578, "auxiliary_loss_mlp": 0.0103169, "balance_loss_clip": 1.04896045, "balance_loss_mlp": 1.02455878, "epoch": 0.5277460470149702, "flos": 22710447077760.0, "grad_norm": 1.8419393749197537, "language_loss": 0.76744133, "learning_rate": 1.9174524603236676e-06, "loss": 0.78925407, "num_input_tokens_seen": 94871185, "step": 4389, "time_per_iteration": 2.538311004638672 }, { "auxiliary_loss_clip": 0.01145256, "auxiliary_loss_mlp": 0.01030407, "balance_loss_clip": 1.04928732, "balance_loss_mlp": 1.02230656, "epoch": 0.5278662899056094, "flos": 19902699734400.0, "grad_norm": 3.2793837429932085, "language_loss": 0.76191056, "learning_rate": 1.916674158738527e-06, "loss": 0.78366721, "num_input_tokens_seen": 94890090, "step": 4390, "time_per_iteration": 2.5299510955810547 }, { "auxiliary_loss_clip": 0.01128404, "auxiliary_loss_mlp": 0.00761792, "balance_loss_clip": 1.05044699, "balance_loss_mlp": 1.00023127, "epoch": 0.5279865327962484, "flos": 18005215875840.0, "grad_norm": 1.7718163330304455, "language_loss": 0.60480559, "learning_rate": 1.9158958697938506e-06, "loss": 0.62370759, "num_input_tokens_seen": 94908470, "step": 4391, "time_per_iteration": 2.529927968978882 }, { "auxiliary_loss_clip": 0.01141715, "auxiliary_loss_mlp": 0.01028375, "balance_loss_clip": 1.04901958, "balance_loss_mlp": 1.02050996, "epoch": 0.5281067756868875, "flos": 15924443892480.0, "grad_norm": 2.3830035577691557, "language_loss": 0.85510594, "learning_rate": 1.9151175936077032e-06, "loss": 0.87680686, "num_input_tokens_seen": 94923440, "step": 4392, "time_per_iteration": 2.528831958770752 }, { "auxiliary_loss_clip": 0.01154841, "auxiliary_loss_mlp": 0.01029287, "balance_loss_clip": 1.0495019, "balance_loss_mlp": 1.02175319, "epoch": 0.5282270185775266, "flos": 19426488197760.0, "grad_norm": 1.5434821597672859, "language_loss": 0.79221666, "learning_rate": 1.9143393302981507e-06, "loss": 0.81405795, "num_input_tokens_seen": 94941125, "step": 4393, "time_per_iteration": 2.4823575019836426 }, { "auxiliary_loss_clip": 0.0115038, "auxiliary_loss_mlp": 0.01030459, "balance_loss_clip": 1.05084443, "balance_loss_mlp": 1.02305579, "epoch": 0.5283472614681657, "flos": 16399613934720.0, "grad_norm": 1.751339009408559, "language_loss": 0.83231217, "learning_rate": 1.913561079983252e-06, "loss": 0.85412061, "num_input_tokens_seen": 94959950, "step": 4394, "time_per_iteration": 2.5289502143859863 }, { "auxiliary_loss_clip": 0.01145563, "auxiliary_loss_mlp": 0.0103548, "balance_loss_clip": 1.04647255, "balance_loss_mlp": 1.02740359, "epoch": 0.5284675043588047, "flos": 26760524163840.0, "grad_norm": 3.3557891984931927, "language_loss": 0.75133133, "learning_rate": 1.9127828427810693e-06, "loss": 0.77314174, "num_input_tokens_seen": 94980515, "step": 4395, "time_per_iteration": 2.5643362998962402 }, { "auxiliary_loss_clip": 0.01140194, "auxiliary_loss_mlp": 0.01030469, "balance_loss_clip": 1.04749572, "balance_loss_mlp": 1.02244639, "epoch": 0.5285877472494439, "flos": 19899898473600.0, "grad_norm": 3.0418485402451054, "language_loss": 0.80992818, "learning_rate": 1.9120046188096607e-06, "loss": 0.83163476, "num_input_tokens_seen": 94998560, "step": 4396, "time_per_iteration": 2.5681049823760986 }, { "auxiliary_loss_clip": 0.01144862, "auxiliary_loss_mlp": 0.0103006, "balance_loss_clip": 1.05195403, "balance_loss_mlp": 1.02271676, "epoch": 0.528707990140083, "flos": 20011257613440.0, "grad_norm": 2.1932694368111165, "language_loss": 0.74121636, "learning_rate": 1.9112264081870804e-06, "loss": 0.76296556, "num_input_tokens_seen": 95016950, "step": 4397, "time_per_iteration": 2.51755690574646 }, { "auxiliary_loss_clip": 0.01131207, "auxiliary_loss_mlp": 0.0102875, "balance_loss_clip": 1.05012655, "balance_loss_mlp": 1.02092063, "epoch": 0.528828233030722, "flos": 20667956014080.0, "grad_norm": 2.1024444516879237, "language_loss": 0.75577879, "learning_rate": 1.9104482110313843e-06, "loss": 0.77737838, "num_input_tokens_seen": 95036540, "step": 4398, "time_per_iteration": 2.6656744480133057 }, { "auxiliary_loss_clip": 0.01162435, "auxiliary_loss_mlp": 0.01027374, "balance_loss_clip": 1.05326867, "balance_loss_mlp": 1.02020359, "epoch": 0.5289484759213612, "flos": 25192448956800.0, "grad_norm": 1.8354255960446761, "language_loss": 0.74446607, "learning_rate": 1.909670027460623e-06, "loss": 0.7663641, "num_input_tokens_seen": 95053840, "step": 4399, "time_per_iteration": 3.256376266479492 }, { "auxiliary_loss_clip": 0.01159229, "auxiliary_loss_mlp": 0.0102698, "balance_loss_clip": 1.05163705, "balance_loss_mlp": 1.01967227, "epoch": 0.5290687188120002, "flos": 31139255715840.0, "grad_norm": 1.663156585027762, "language_loss": 0.71822214, "learning_rate": 1.908891857592847e-06, "loss": 0.74008423, "num_input_tokens_seen": 95074910, "step": 4400, "time_per_iteration": 2.5710830688476562 }, { "auxiliary_loss_clip": 0.01125132, "auxiliary_loss_mlp": 0.01025298, "balance_loss_clip": 1.04747391, "balance_loss_mlp": 1.01802003, "epoch": 0.5291889617026393, "flos": 20119851406080.0, "grad_norm": 2.2483149101764326, "language_loss": 0.9009974, "learning_rate": 1.9081137015461034e-06, "loss": 0.92250168, "num_input_tokens_seen": 95090985, "step": 4401, "time_per_iteration": 2.5242116451263428 }, { "auxiliary_loss_clip": 0.01112247, "auxiliary_loss_mlp": 0.01030732, "balance_loss_clip": 1.04861689, "balance_loss_mlp": 1.02292049, "epoch": 0.5293092045932785, "flos": 19643747610240.0, "grad_norm": 2.4552449448926343, "language_loss": 0.9046737, "learning_rate": 1.9073355594384383e-06, "loss": 0.92610347, "num_input_tokens_seen": 95109225, "step": 4402, "time_per_iteration": 2.5764408111572266 }, { "auxiliary_loss_clip": 0.01126094, "auxiliary_loss_mlp": 0.01034631, "balance_loss_clip": 1.04888248, "balance_loss_mlp": 1.02664399, "epoch": 0.5294294474839175, "flos": 24317736958080.0, "grad_norm": 2.285835116377706, "language_loss": 0.80538219, "learning_rate": 1.906557431387895e-06, "loss": 0.82698941, "num_input_tokens_seen": 95128215, "step": 4403, "time_per_iteration": 2.574779510498047 }, { "auxiliary_loss_clip": 0.01132037, "auxiliary_loss_mlp": 0.01030325, "balance_loss_clip": 1.0531019, "balance_loss_mlp": 1.02295518, "epoch": 0.5295496903745566, "flos": 18875941464960.0, "grad_norm": 2.0857706501144984, "language_loss": 0.78453469, "learning_rate": 1.905779317512516e-06, "loss": 0.80615824, "num_input_tokens_seen": 95145760, "step": 4404, "time_per_iteration": 3.3134806156158447 }, { "auxiliary_loss_clip": 0.01158816, "auxiliary_loss_mlp": 0.01025009, "balance_loss_clip": 1.0508244, "balance_loss_mlp": 1.01829457, "epoch": 0.5296699332651957, "flos": 20923101296640.0, "grad_norm": 2.6545430066243054, "language_loss": 0.80971444, "learning_rate": 1.9050012179303385e-06, "loss": 0.83155268, "num_input_tokens_seen": 95164270, "step": 4405, "time_per_iteration": 2.493962526321411 }, { "auxiliary_loss_clip": 0.01160663, "auxiliary_loss_mlp": 0.01028863, "balance_loss_clip": 1.0493691, "balance_loss_mlp": 1.02064085, "epoch": 0.5297901761558348, "flos": 22046745525120.0, "grad_norm": 2.7379806873767016, "language_loss": 0.69077814, "learning_rate": 1.904223132759401e-06, "loss": 0.71267343, "num_input_tokens_seen": 95182870, "step": 4406, "time_per_iteration": 2.4954264163970947 }, { "auxiliary_loss_clip": 0.01161765, "auxiliary_loss_mlp": 0.0102382, "balance_loss_clip": 1.05055285, "balance_loss_mlp": 1.01678061, "epoch": 0.5299104190464738, "flos": 21798495653760.0, "grad_norm": 3.7744787774356308, "language_loss": 0.68673629, "learning_rate": 1.9034450621177383e-06, "loss": 0.70859212, "num_input_tokens_seen": 95201190, "step": 4407, "time_per_iteration": 4.310419797897339 }, { "auxiliary_loss_clip": 0.01161079, "auxiliary_loss_mlp": 0.01030384, "balance_loss_clip": 1.05250859, "balance_loss_mlp": 1.02205729, "epoch": 0.530030661937113, "flos": 14720790119040.0, "grad_norm": 2.023697770998391, "language_loss": 0.7009033, "learning_rate": 1.9026670061233824e-06, "loss": 0.7228179, "num_input_tokens_seen": 95218625, "step": 4408, "time_per_iteration": 2.4622395038604736 }, { "auxiliary_loss_clip": 0.01141148, "auxiliary_loss_mlp": 0.01028645, "balance_loss_clip": 1.05028999, "balance_loss_mlp": 1.02127159, "epoch": 0.5301509048277521, "flos": 21251504367360.0, "grad_norm": 1.826996368459706, "language_loss": 0.80667526, "learning_rate": 1.901888964894365e-06, "loss": 0.82837313, "num_input_tokens_seen": 95237665, "step": 4409, "time_per_iteration": 2.5433743000030518 }, { "auxiliary_loss_clip": 0.01174771, "auxiliary_loss_mlp": 0.01025956, "balance_loss_clip": 1.05117679, "balance_loss_mlp": 1.01829672, "epoch": 0.5302711477183911, "flos": 25957058791680.0, "grad_norm": 1.7406538165760426, "language_loss": 0.68002915, "learning_rate": 1.9011109385487134e-06, "loss": 0.70203638, "num_input_tokens_seen": 95258915, "step": 4410, "time_per_iteration": 2.493377208709717 }, { "auxiliary_loss_clip": 0.01176092, "auxiliary_loss_mlp": 0.0102995, "balance_loss_clip": 1.05364108, "balance_loss_mlp": 1.02223718, "epoch": 0.5303913906090303, "flos": 22273126992000.0, "grad_norm": 3.0419574743283335, "language_loss": 0.66576028, "learning_rate": 1.900332927204454e-06, "loss": 0.68782073, "num_input_tokens_seen": 95277365, "step": 4411, "time_per_iteration": 2.462980031967163 }, { "auxiliary_loss_clip": 0.01150751, "auxiliary_loss_mlp": 0.01029543, "balance_loss_clip": 1.04956889, "balance_loss_mlp": 1.02147245, "epoch": 0.5305116334996693, "flos": 24936010784640.0, "grad_norm": 1.7779364195046625, "language_loss": 0.7679646, "learning_rate": 1.8995549309796097e-06, "loss": 0.7897675, "num_input_tokens_seen": 95296670, "step": 4412, "time_per_iteration": 2.559704303741455 }, { "auxiliary_loss_clip": 0.0116517, "auxiliary_loss_mlp": 0.01028355, "balance_loss_clip": 1.05279732, "balance_loss_mlp": 1.02130973, "epoch": 0.5306318763903084, "flos": 20189338266240.0, "grad_norm": 3.9993124443153274, "language_loss": 0.76926541, "learning_rate": 1.8987769499922028e-06, "loss": 0.79120064, "num_input_tokens_seen": 95315640, "step": 4413, "time_per_iteration": 2.4937398433685303 }, { "auxiliary_loss_clip": 0.01159266, "auxiliary_loss_mlp": 0.00761142, "balance_loss_clip": 1.05210114, "balance_loss_mlp": 1.00022876, "epoch": 0.5307521192809476, "flos": 20266366982400.0, "grad_norm": 2.148915909986806, "language_loss": 0.70579553, "learning_rate": 1.897998984360252e-06, "loss": 0.72499955, "num_input_tokens_seen": 95334610, "step": 4414, "time_per_iteration": 2.499934196472168 }, { "auxiliary_loss_clip": 0.01144433, "auxiliary_loss_mlp": 0.01025323, "balance_loss_clip": 1.05083477, "balance_loss_mlp": 1.01859677, "epoch": 0.5308723621715866, "flos": 28844276976000.0, "grad_norm": 1.4505956442454657, "language_loss": 0.78499252, "learning_rate": 1.897221034201775e-06, "loss": 0.80669004, "num_input_tokens_seen": 95358350, "step": 4415, "time_per_iteration": 2.6479642391204834 }, { "auxiliary_loss_clip": 0.01131627, "auxiliary_loss_mlp": 0.01026016, "balance_loss_clip": 1.04696047, "balance_loss_mlp": 1.01912534, "epoch": 0.5309926050622257, "flos": 27457766040960.0, "grad_norm": 1.6868037061074566, "language_loss": 0.66852009, "learning_rate": 1.8964430996347842e-06, "loss": 0.6900965, "num_input_tokens_seen": 95379900, "step": 4416, "time_per_iteration": 2.6474878787994385 }, { "auxiliary_loss_clip": 0.01143994, "auxiliary_loss_mlp": 0.01036256, "balance_loss_clip": 1.04678059, "balance_loss_mlp": 1.02831697, "epoch": 0.5311128479528648, "flos": 20514545026560.0, "grad_norm": 1.614169259339339, "language_loss": 0.82227421, "learning_rate": 1.8956651807772931e-06, "loss": 0.84407675, "num_input_tokens_seen": 95397935, "step": 4417, "time_per_iteration": 2.5351674556732178 }, { "auxiliary_loss_clip": 0.0115948, "auxiliary_loss_mlp": 0.01030953, "balance_loss_clip": 1.05384541, "balance_loss_mlp": 1.02360392, "epoch": 0.5312330908435039, "flos": 21397660807680.0, "grad_norm": 1.7481634807499669, "language_loss": 0.83986986, "learning_rate": 1.8948872777473115e-06, "loss": 0.86177427, "num_input_tokens_seen": 95415890, "step": 4418, "time_per_iteration": 2.499054431915283 }, { "auxiliary_loss_clip": 0.01146993, "auxiliary_loss_mlp": 0.01038079, "balance_loss_clip": 1.04968715, "balance_loss_mlp": 1.03028321, "epoch": 0.531353333734143, "flos": 24717350741760.0, "grad_norm": 1.6885319249249462, "language_loss": 0.63422692, "learning_rate": 1.8941093906628458e-06, "loss": 0.65607762, "num_input_tokens_seen": 95433675, "step": 4419, "time_per_iteration": 2.5424652099609375 }, { "auxiliary_loss_clip": 0.01140721, "auxiliary_loss_mlp": 0.01028946, "balance_loss_clip": 1.04687119, "balance_loss_mlp": 1.02145946, "epoch": 0.531473576624782, "flos": 30480689808000.0, "grad_norm": 1.9045932746181244, "language_loss": 0.70768142, "learning_rate": 1.893331519641902e-06, "loss": 0.7293781, "num_input_tokens_seen": 95455820, "step": 4420, "time_per_iteration": 2.6080210208892822 }, { "auxiliary_loss_clip": 0.01119281, "auxiliary_loss_mlp": 0.01029903, "balance_loss_clip": 1.04257619, "balance_loss_mlp": 1.02188349, "epoch": 0.5315938195154212, "flos": 23002975440000.0, "grad_norm": 3.6919968066614866, "language_loss": 0.73719394, "learning_rate": 1.8925536648024815e-06, "loss": 0.75868583, "num_input_tokens_seen": 95473240, "step": 4421, "time_per_iteration": 2.567322015762329 }, { "auxiliary_loss_clip": 0.01174615, "auxiliary_loss_mlp": 0.01027695, "balance_loss_clip": 1.05221045, "balance_loss_mlp": 1.02046537, "epoch": 0.5317140624060602, "flos": 22748584343040.0, "grad_norm": 1.9861729381962434, "language_loss": 0.75827587, "learning_rate": 1.8917758262625849e-06, "loss": 0.78029895, "num_input_tokens_seen": 95493480, "step": 4422, "time_per_iteration": 2.5106558799743652 }, { "auxiliary_loss_clip": 0.01142491, "auxiliary_loss_mlp": 0.01029121, "balance_loss_clip": 1.049824, "balance_loss_mlp": 1.02188456, "epoch": 0.5318343052966993, "flos": 22821087945600.0, "grad_norm": 1.6406390786960448, "language_loss": 0.80905747, "learning_rate": 1.8909980041402089e-06, "loss": 0.83077359, "num_input_tokens_seen": 95512075, "step": 4423, "time_per_iteration": 2.5378849506378174 }, { "auxiliary_loss_clip": 0.01154046, "auxiliary_loss_mlp": 0.01029898, "balance_loss_clip": 1.04880512, "balance_loss_mlp": 1.02189898, "epoch": 0.5319545481873384, "flos": 13626089274240.0, "grad_norm": 2.3868523916218294, "language_loss": 0.65501714, "learning_rate": 1.8902201985533494e-06, "loss": 0.67685658, "num_input_tokens_seen": 95529340, "step": 4424, "time_per_iteration": 2.479931116104126 }, { "auxiliary_loss_clip": 0.01146561, "auxiliary_loss_mlp": 0.01031817, "balance_loss_clip": 1.05030584, "balance_loss_mlp": 1.02447963, "epoch": 0.5320747910779775, "flos": 22162522037760.0, "grad_norm": 1.6649935899486359, "language_loss": 0.74840987, "learning_rate": 1.8894424096199983e-06, "loss": 0.7701937, "num_input_tokens_seen": 95548545, "step": 4425, "time_per_iteration": 3.25303316116333 }, { "auxiliary_loss_clip": 0.01160153, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 1.05159712, "balance_loss_mlp": 1.02851999, "epoch": 0.5321950339686166, "flos": 18588081870720.0, "grad_norm": 1.9135010158834054, "language_loss": 0.85844207, "learning_rate": 1.8886646374581463e-06, "loss": 0.88040555, "num_input_tokens_seen": 95567770, "step": 4426, "time_per_iteration": 2.4940221309661865 }, { "auxiliary_loss_clip": 0.01156601, "auxiliary_loss_mlp": 0.01032727, "balance_loss_clip": 1.0490334, "balance_loss_mlp": 1.02496672, "epoch": 0.5323152768592557, "flos": 22856818999680.0, "grad_norm": 2.293364493754872, "language_loss": 0.71158993, "learning_rate": 1.8878868821857795e-06, "loss": 0.73348325, "num_input_tokens_seen": 95587420, "step": 4427, "time_per_iteration": 2.4957005977630615 }, { "auxiliary_loss_clip": 0.01112921, "auxiliary_loss_mlp": 0.01027017, "balance_loss_clip": 1.04328394, "balance_loss_mlp": 1.01894701, "epoch": 0.5324355197498948, "flos": 33948690998400.0, "grad_norm": 2.3541649837753225, "language_loss": 0.75268912, "learning_rate": 1.8871091439208838e-06, "loss": 0.7740885, "num_input_tokens_seen": 95609030, "step": 4428, "time_per_iteration": 2.712125062942505 }, { "auxiliary_loss_clip": 0.011154, "auxiliary_loss_mlp": 0.01033328, "balance_loss_clip": 1.04590797, "balance_loss_mlp": 1.02564502, "epoch": 0.5325557626405338, "flos": 23256720092160.0, "grad_norm": 2.368165545659527, "language_loss": 0.77642465, "learning_rate": 1.8863314227814414e-06, "loss": 0.79791188, "num_input_tokens_seen": 95627340, "step": 4429, "time_per_iteration": 2.6075961589813232 }, { "auxiliary_loss_clip": 0.01165231, "auxiliary_loss_mlp": 0.01025583, "balance_loss_clip": 1.05224514, "balance_loss_mlp": 1.01789403, "epoch": 0.532676005531173, "flos": 26718687797760.0, "grad_norm": 2.2912272871090744, "language_loss": 0.48693445, "learning_rate": 1.8855537188854313e-06, "loss": 0.50884259, "num_input_tokens_seen": 95646315, "step": 4430, "time_per_iteration": 3.3498008251190186 }, { "auxiliary_loss_clip": 0.01160151, "auxiliary_loss_mlp": 0.01027839, "balance_loss_clip": 1.04789162, "balance_loss_mlp": 1.02033472, "epoch": 0.5327962484218121, "flos": 17894610921600.0, "grad_norm": 2.020206974813297, "language_loss": 0.78396046, "learning_rate": 1.8847760323508315e-06, "loss": 0.80584037, "num_input_tokens_seen": 95665220, "step": 4431, "time_per_iteration": 2.4697282314300537 }, { "auxiliary_loss_clip": 0.01141404, "auxiliary_loss_mlp": 0.01032261, "balance_loss_clip": 1.0496223, "balance_loss_mlp": 1.02447081, "epoch": 0.5329164913124511, "flos": 17925385898880.0, "grad_norm": 1.8023067303552525, "language_loss": 0.75665951, "learning_rate": 1.883998363295616e-06, "loss": 0.77839613, "num_input_tokens_seen": 95682700, "step": 4432, "time_per_iteration": 3.348810911178589 }, { "auxiliary_loss_clip": 0.01045385, "auxiliary_loss_mlp": 0.01002677, "balance_loss_clip": 1.01550305, "balance_loss_mlp": 1.00160432, "epoch": 0.5330367342030903, "flos": 57254178781440.0, "grad_norm": 0.9125730505056111, "language_loss": 0.62652493, "learning_rate": 1.8832207118377565e-06, "loss": 0.64700556, "num_input_tokens_seen": 95738070, "step": 4433, "time_per_iteration": 3.8186235427856445 }, { "auxiliary_loss_clip": 0.01172922, "auxiliary_loss_mlp": 0.01024899, "balance_loss_clip": 1.05188727, "balance_loss_mlp": 1.01766598, "epoch": 0.5331569770937293, "flos": 17420518287360.0, "grad_norm": 1.9228915631523582, "language_loss": 0.69224364, "learning_rate": 1.882443078095222e-06, "loss": 0.71422184, "num_input_tokens_seen": 95756950, "step": 4434, "time_per_iteration": 2.4902453422546387 }, { "auxiliary_loss_clip": 0.01029279, "auxiliary_loss_mlp": 0.01002644, "balance_loss_clip": 1.01856995, "balance_loss_mlp": 1.00144005, "epoch": 0.5332772199843684, "flos": 56750783627520.0, "grad_norm": 0.8701158485646526, "language_loss": 0.66783118, "learning_rate": 1.8816654621859794e-06, "loss": 0.68815041, "num_input_tokens_seen": 95816615, "step": 4435, "time_per_iteration": 3.097675085067749 }, { "auxiliary_loss_clip": 0.01169594, "auxiliary_loss_mlp": 0.0102353, "balance_loss_clip": 1.05089664, "balance_loss_mlp": 1.01584125, "epoch": 0.5333974628750076, "flos": 18697753071360.0, "grad_norm": 2.3394478657223168, "language_loss": 0.72325099, "learning_rate": 1.8808878642279915e-06, "loss": 0.74518222, "num_input_tokens_seen": 95832020, "step": 4436, "time_per_iteration": 2.447885036468506 }, { "auxiliary_loss_clip": 0.01130881, "auxiliary_loss_mlp": 0.01030151, "balance_loss_clip": 1.04192603, "balance_loss_mlp": 1.02234852, "epoch": 0.5335177057656466, "flos": 23805507058560.0, "grad_norm": 2.2045138127337864, "language_loss": 0.65519035, "learning_rate": 1.8801102843392209e-06, "loss": 0.67680073, "num_input_tokens_seen": 95851425, "step": 4437, "time_per_iteration": 2.59413743019104 }, { "auxiliary_loss_clip": 0.01131073, "auxiliary_loss_mlp": 0.01029531, "balance_loss_clip": 1.04727364, "balance_loss_mlp": 1.02203023, "epoch": 0.5336379486562857, "flos": 25078683605760.0, "grad_norm": 1.516046661611078, "language_loss": 0.85024703, "learning_rate": 1.8793327226376238e-06, "loss": 0.87185305, "num_input_tokens_seen": 95870745, "step": 4438, "time_per_iteration": 2.6353368759155273 }, { "auxiliary_loss_clip": 0.01150797, "auxiliary_loss_mlp": 0.01027784, "balance_loss_clip": 1.04963958, "balance_loss_mlp": 1.01991642, "epoch": 0.5337581915469248, "flos": 21396691140480.0, "grad_norm": 1.7027554336218107, "language_loss": 0.80287766, "learning_rate": 1.8785551792411569e-06, "loss": 0.82466352, "num_input_tokens_seen": 95889755, "step": 4439, "time_per_iteration": 2.544616937637329 }, { "auxiliary_loss_clip": 0.01145728, "auxiliary_loss_mlp": 0.01029855, "balance_loss_clip": 1.04948401, "balance_loss_mlp": 1.02278304, "epoch": 0.5338784344375639, "flos": 14865905064960.0, "grad_norm": 2.34673440018098, "language_loss": 0.82273519, "learning_rate": 1.8777776542677733e-06, "loss": 0.844491, "num_input_tokens_seen": 95907805, "step": 4440, "time_per_iteration": 2.5227468013763428 }, { "auxiliary_loss_clip": 0.01127763, "auxiliary_loss_mlp": 0.01031006, "balance_loss_clip": 1.04321957, "balance_loss_mlp": 1.02344775, "epoch": 0.5339986773282029, "flos": 20813501923200.0, "grad_norm": 1.8023244957121562, "language_loss": 0.72490841, "learning_rate": 1.8770001478354216e-06, "loss": 0.74649608, "num_input_tokens_seen": 95927480, "step": 4441, "time_per_iteration": 2.57855224609375 }, { "auxiliary_loss_clip": 0.01153997, "auxiliary_loss_mlp": 0.01032089, "balance_loss_clip": 1.04863811, "balance_loss_mlp": 1.0236187, "epoch": 0.5341189202188421, "flos": 17969089772160.0, "grad_norm": 2.135999440791739, "language_loss": 0.84045374, "learning_rate": 1.8762226600620504e-06, "loss": 0.86231458, "num_input_tokens_seen": 95946095, "step": 4442, "time_per_iteration": 2.489802598953247 }, { "auxiliary_loss_clip": 0.0115057, "auxiliary_loss_mlp": 0.01031583, "balance_loss_clip": 1.04802692, "balance_loss_mlp": 1.02331889, "epoch": 0.5342391631094812, "flos": 11031866328960.0, "grad_norm": 2.4171818755116226, "language_loss": 0.58958864, "learning_rate": 1.8754451910656031e-06, "loss": 0.61141014, "num_input_tokens_seen": 95959995, "step": 4443, "time_per_iteration": 2.46627140045166 }, { "auxiliary_loss_clip": 0.01124783, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.04706109, "balance_loss_mlp": 1.02270126, "epoch": 0.5343594060001202, "flos": 15339135772800.0, "grad_norm": 1.839153863723415, "language_loss": 0.82988954, "learning_rate": 1.8746677409640212e-06, "loss": 0.85144144, "num_input_tokens_seen": 95977095, "step": 4444, "time_per_iteration": 2.5980372428894043 }, { "auxiliary_loss_clip": 0.01163224, "auxiliary_loss_mlp": 0.01026952, "balance_loss_clip": 1.05195904, "balance_loss_mlp": 1.01920891, "epoch": 0.5344796488907594, "flos": 26900898514560.0, "grad_norm": 1.9269222256468725, "language_loss": 0.84881926, "learning_rate": 1.8738903098752432e-06, "loss": 0.8707211, "num_input_tokens_seen": 95996225, "step": 4445, "time_per_iteration": 2.539308547973633 }, { "auxiliary_loss_clip": 0.01146262, "auxiliary_loss_mlp": 0.01026366, "balance_loss_clip": 1.04908776, "balance_loss_mlp": 1.01895094, "epoch": 0.5345998917813984, "flos": 25411216740480.0, "grad_norm": 2.062658350395964, "language_loss": 0.73178345, "learning_rate": 1.8731128979172052e-06, "loss": 0.7535097, "num_input_tokens_seen": 96015425, "step": 4446, "time_per_iteration": 2.5703680515289307 }, { "auxiliary_loss_clip": 0.01142583, "auxiliary_loss_mlp": 0.0103012, "balance_loss_clip": 1.04870081, "balance_loss_mlp": 1.02289891, "epoch": 0.5347201346720375, "flos": 32853379622400.0, "grad_norm": 2.2962804503528385, "language_loss": 0.66764522, "learning_rate": 1.8723355052078394e-06, "loss": 0.6893723, "num_input_tokens_seen": 96035460, "step": 4447, "time_per_iteration": 2.6124227046966553 }, { "auxiliary_loss_clip": 0.01156624, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.04926443, "balance_loss_mlp": 1.02611268, "epoch": 0.5348403775626767, "flos": 17967940536960.0, "grad_norm": 2.337581893688643, "language_loss": 0.77254909, "learning_rate": 1.8715581318650765e-06, "loss": 0.79446185, "num_input_tokens_seen": 96054515, "step": 4448, "time_per_iteration": 2.500372886657715 }, { "auxiliary_loss_clip": 0.01142239, "auxiliary_loss_mlp": 0.01028428, "balance_loss_clip": 1.04916811, "balance_loss_mlp": 1.02041745, "epoch": 0.5349606204533157, "flos": 17603339535360.0, "grad_norm": 2.270237626143738, "language_loss": 0.82302612, "learning_rate": 1.8707807780068422e-06, "loss": 0.84473276, "num_input_tokens_seen": 96072330, "step": 4449, "time_per_iteration": 2.5262610912323 }, { "auxiliary_loss_clip": 0.01143574, "auxiliary_loss_mlp": 0.01026196, "balance_loss_clip": 1.04831672, "balance_loss_mlp": 1.0186621, "epoch": 0.5350808633439548, "flos": 29167831710720.0, "grad_norm": 2.5900882185817684, "language_loss": 0.66462624, "learning_rate": 1.8700034437510611e-06, "loss": 0.68632394, "num_input_tokens_seen": 96092425, "step": 4450, "time_per_iteration": 2.624983549118042 }, { "auxiliary_loss_clip": 0.01121282, "auxiliary_loss_mlp": 0.01027506, "balance_loss_clip": 1.04568756, "balance_loss_mlp": 1.02017725, "epoch": 0.5352011062345938, "flos": 19499997381120.0, "grad_norm": 2.175401096904989, "language_loss": 0.81153232, "learning_rate": 1.8692261292156549e-06, "loss": 0.83302015, "num_input_tokens_seen": 96111660, "step": 4451, "time_per_iteration": 3.3375275135040283 }, { "auxiliary_loss_clip": 0.01173132, "auxiliary_loss_mlp": 0.01029126, "balance_loss_clip": 1.05340767, "balance_loss_mlp": 1.0215981, "epoch": 0.535321349125233, "flos": 23477642691840.0, "grad_norm": 1.8408051671204206, "language_loss": 0.81070155, "learning_rate": 1.8684488345185401e-06, "loss": 0.83272409, "num_input_tokens_seen": 96131835, "step": 4452, "time_per_iteration": 2.5121216773986816 }, { "auxiliary_loss_clip": 0.01177003, "auxiliary_loss_mlp": 0.010317, "balance_loss_clip": 1.05446005, "balance_loss_mlp": 1.02372456, "epoch": 0.535441592015872, "flos": 20478059786880.0, "grad_norm": 10.988948838778407, "language_loss": 0.77946466, "learning_rate": 1.8676715597776332e-06, "loss": 0.8015517, "num_input_tokens_seen": 96150180, "step": 4453, "time_per_iteration": 2.46423602104187 }, { "auxiliary_loss_clip": 0.01106696, "auxiliary_loss_mlp": 0.01023304, "balance_loss_clip": 1.04242778, "balance_loss_mlp": 1.01610935, "epoch": 0.5355618349065111, "flos": 19573147428480.0, "grad_norm": 1.7500527988517882, "language_loss": 0.75889862, "learning_rate": 1.8668943051108455e-06, "loss": 0.78019857, "num_input_tokens_seen": 96167485, "step": 4454, "time_per_iteration": 2.5958218574523926 }, { "auxiliary_loss_clip": 0.01143839, "auxiliary_loss_mlp": 0.01026388, "balance_loss_clip": 1.04769111, "balance_loss_mlp": 1.01826096, "epoch": 0.5356820777971503, "flos": 24024633978240.0, "grad_norm": 1.765273801880341, "language_loss": 0.76601541, "learning_rate": 1.8661170706360856e-06, "loss": 0.7877177, "num_input_tokens_seen": 96186650, "step": 4455, "time_per_iteration": 2.558471918106079 }, { "auxiliary_loss_clip": 0.01161158, "auxiliary_loss_mlp": 0.01026036, "balance_loss_clip": 1.05371547, "balance_loss_mlp": 1.01892531, "epoch": 0.5358023206877893, "flos": 20884676722560.0, "grad_norm": 1.5631177381213135, "language_loss": 0.81655216, "learning_rate": 1.8653398564712594e-06, "loss": 0.83842409, "num_input_tokens_seen": 96205595, "step": 4456, "time_per_iteration": 3.293365001678467 }, { "auxiliary_loss_clip": 0.01157913, "auxiliary_loss_mlp": 0.01028821, "balance_loss_clip": 1.05089378, "balance_loss_mlp": 1.02162075, "epoch": 0.5359225635784284, "flos": 22418996123520.0, "grad_norm": 1.8996444173804905, "language_loss": 0.81881469, "learning_rate": 1.8645626627342704e-06, "loss": 0.84068203, "num_input_tokens_seen": 96226360, "step": 4457, "time_per_iteration": 2.5387778282165527 }, { "auxiliary_loss_clip": 0.01162694, "auxiliary_loss_mlp": 0.01030302, "balance_loss_clip": 1.05122495, "balance_loss_mlp": 1.02299714, "epoch": 0.5360428064690675, "flos": 24097784025600.0, "grad_norm": 2.0141314830521475, "language_loss": 0.80981112, "learning_rate": 1.8637854895430172e-06, "loss": 0.83174098, "num_input_tokens_seen": 96245625, "step": 4458, "time_per_iteration": 2.510599374771118 }, { "auxiliary_loss_clip": 0.0112025, "auxiliary_loss_mlp": 0.01027348, "balance_loss_clip": 1.04441714, "balance_loss_mlp": 1.01928949, "epoch": 0.5361630493597066, "flos": 21434505183360.0, "grad_norm": 2.1173316283715775, "language_loss": 0.69382256, "learning_rate": 1.8630083370153978e-06, "loss": 0.71529853, "num_input_tokens_seen": 96265265, "step": 4459, "time_per_iteration": 3.3858582973480225 }, { "auxiliary_loss_clip": 0.01016352, "auxiliary_loss_mlp": 0.01004181, "balance_loss_clip": 1.01470375, "balance_loss_mlp": 1.00307262, "epoch": 0.5362832922503457, "flos": 68888696520960.0, "grad_norm": 0.7451312143488024, "language_loss": 0.55405617, "learning_rate": 1.8622312052693041e-06, "loss": 0.57426155, "num_input_tokens_seen": 96326445, "step": 4460, "time_per_iteration": 3.2886176109313965 }, { "auxiliary_loss_clip": 0.01151324, "auxiliary_loss_mlp": 0.01033527, "balance_loss_clip": 1.04578388, "balance_loss_mlp": 1.02609479, "epoch": 0.5364035351409848, "flos": 9793702563840.0, "grad_norm": 2.105664831456696, "language_loss": 0.72145271, "learning_rate": 1.8614540944226267e-06, "loss": 0.74330115, "num_input_tokens_seen": 96343115, "step": 4461, "time_per_iteration": 2.5095982551574707 }, { "auxiliary_loss_clip": 0.01141333, "auxiliary_loss_mlp": 0.01028935, "balance_loss_clip": 1.05006933, "balance_loss_mlp": 1.02214575, "epoch": 0.5365237780316239, "flos": 23290080848640.0, "grad_norm": 1.704474858223676, "language_loss": 0.67844009, "learning_rate": 1.8606770045932537e-06, "loss": 0.70014274, "num_input_tokens_seen": 96362230, "step": 4462, "time_per_iteration": 2.5441324710845947 }, { "auxiliary_loss_clip": 0.01121509, "auxiliary_loss_mlp": 0.01035583, "balance_loss_clip": 1.04023874, "balance_loss_mlp": 1.02767348, "epoch": 0.5366440209222629, "flos": 26578133879040.0, "grad_norm": 1.828827218882342, "language_loss": 0.81735229, "learning_rate": 1.859899935899068e-06, "loss": 0.83892322, "num_input_tokens_seen": 96382085, "step": 4463, "time_per_iteration": 2.614748954772949 }, { "auxiliary_loss_clip": 0.01143401, "auxiliary_loss_mlp": 0.01030956, "balance_loss_clip": 1.05098569, "balance_loss_mlp": 1.02317762, "epoch": 0.5367642638129021, "flos": 19608052469760.0, "grad_norm": 1.6608918783865303, "language_loss": 0.78986573, "learning_rate": 1.8591228884579506e-06, "loss": 0.81160927, "num_input_tokens_seen": 96400580, "step": 4464, "time_per_iteration": 2.521594762802124 }, { "auxiliary_loss_clip": 0.01136067, "auxiliary_loss_mlp": 0.01026697, "balance_loss_clip": 1.04796863, "balance_loss_mlp": 1.01956284, "epoch": 0.5368845067035412, "flos": 23915214172800.0, "grad_norm": 2.2011985068275712, "language_loss": 0.8212477, "learning_rate": 1.8583458623877795e-06, "loss": 0.84287536, "num_input_tokens_seen": 96419680, "step": 4465, "time_per_iteration": 2.5981311798095703 }, { "auxiliary_loss_clip": 0.0116146, "auxiliary_loss_mlp": 0.01021485, "balance_loss_clip": 1.05128348, "balance_loss_mlp": 1.01420176, "epoch": 0.5370047495941802, "flos": 16873131951360.0, "grad_norm": 1.817968359570094, "language_loss": 0.74478555, "learning_rate": 1.8575688578064281e-06, "loss": 0.76661503, "num_input_tokens_seen": 96437805, "step": 4466, "time_per_iteration": 2.465724468231201 }, { "auxiliary_loss_clip": 0.01162055, "auxiliary_loss_mlp": 0.01024858, "balance_loss_clip": 1.05199075, "balance_loss_mlp": 1.01702619, "epoch": 0.5371249924848194, "flos": 20740926493440.0, "grad_norm": 1.709303147357437, "language_loss": 0.76505345, "learning_rate": 1.8567918748317674e-06, "loss": 0.78692257, "num_input_tokens_seen": 96457155, "step": 4467, "time_per_iteration": 2.525341749191284 }, { "auxiliary_loss_clip": 0.0113031, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.04422677, "balance_loss_mlp": 1.02466643, "epoch": 0.5372452353754584, "flos": 17968120104960.0, "grad_norm": 1.9889036520477918, "language_loss": 0.83060801, "learning_rate": 1.8560149135816659e-06, "loss": 0.85223299, "num_input_tokens_seen": 96473990, "step": 4468, "time_per_iteration": 2.5402350425720215 }, { "auxiliary_loss_clip": 0.01152784, "auxiliary_loss_mlp": 0.01026837, "balance_loss_clip": 1.04588079, "balance_loss_mlp": 1.01952994, "epoch": 0.5373654782660975, "flos": 15377021642880.0, "grad_norm": 2.3633388977720795, "language_loss": 0.84287512, "learning_rate": 1.8552379741739873e-06, "loss": 0.86467135, "num_input_tokens_seen": 96491335, "step": 4469, "time_per_iteration": 2.500734806060791 }, { "auxiliary_loss_clip": 0.0103571, "auxiliary_loss_mlp": 0.00751224, "balance_loss_clip": 1.0147953, "balance_loss_mlp": 1.00020075, "epoch": 0.5374857211567367, "flos": 69000091574400.0, "grad_norm": 0.9125018518267768, "language_loss": 0.55738187, "learning_rate": 1.8544610567265935e-06, "loss": 0.57525122, "num_input_tokens_seen": 96545275, "step": 4470, "time_per_iteration": 3.1079840660095215 }, { "auxiliary_loss_clip": 0.01147531, "auxiliary_loss_mlp": 0.00760696, "balance_loss_clip": 1.05141044, "balance_loss_mlp": 1.00021029, "epoch": 0.5376059640473757, "flos": 15085355207040.0, "grad_norm": 1.9935672688120891, "language_loss": 0.83279264, "learning_rate": 1.853684161357341e-06, "loss": 0.85187489, "num_input_tokens_seen": 96562935, "step": 4471, "time_per_iteration": 2.532824993133545 }, { "auxiliary_loss_clip": 0.01155476, "auxiliary_loss_mlp": 0.00761138, "balance_loss_clip": 1.05097783, "balance_loss_mlp": 1.00020647, "epoch": 0.5377262069380148, "flos": 19792597570560.0, "grad_norm": 2.0970012599413446, "language_loss": 0.76886868, "learning_rate": 1.852907288184085e-06, "loss": 0.78803486, "num_input_tokens_seen": 96581820, "step": 4472, "time_per_iteration": 2.529510498046875 }, { "auxiliary_loss_clip": 0.0111844, "auxiliary_loss_mlp": 0.01032032, "balance_loss_clip": 1.04508138, "balance_loss_mlp": 1.02409244, "epoch": 0.5378464498286539, "flos": 30003077640960.0, "grad_norm": 2.0251511400939806, "language_loss": 0.69904172, "learning_rate": 1.8521304373246762e-06, "loss": 0.72054648, "num_input_tokens_seen": 96602865, "step": 4473, "time_per_iteration": 2.6825625896453857 }, { "auxiliary_loss_clip": 0.01160559, "auxiliary_loss_mlp": 0.01037969, "balance_loss_clip": 1.0489161, "balance_loss_mlp": 1.02932012, "epoch": 0.537966692719293, "flos": 21251217058560.0, "grad_norm": 2.4484471137880277, "language_loss": 0.88913715, "learning_rate": 1.8513536088969626e-06, "loss": 0.91112244, "num_input_tokens_seen": 96620530, "step": 4474, "time_per_iteration": 2.5299735069274902 }, { "auxiliary_loss_clip": 0.01161013, "auxiliary_loss_mlp": 0.01035562, "balance_loss_clip": 1.05250239, "balance_loss_mlp": 1.02782845, "epoch": 0.538086935609932, "flos": 21543170803200.0, "grad_norm": 1.6210206238210152, "language_loss": 0.80452764, "learning_rate": 1.8505768030187884e-06, "loss": 0.82649338, "num_input_tokens_seen": 96640660, "step": 4475, "time_per_iteration": 2.515212297439575 }, { "auxiliary_loss_clip": 0.01142302, "auxiliary_loss_mlp": 0.01029956, "balance_loss_clip": 1.05038404, "balance_loss_mlp": 1.02278531, "epoch": 0.5382071785005712, "flos": 22747219626240.0, "grad_norm": 1.468934461139235, "language_loss": 0.79741752, "learning_rate": 1.849800019807995e-06, "loss": 0.81914008, "num_input_tokens_seen": 96661885, "step": 4476, "time_per_iteration": 3.354647159576416 }, { "auxiliary_loss_clip": 0.01130145, "auxiliary_loss_mlp": 0.01033572, "balance_loss_clip": 1.04804325, "balance_loss_mlp": 1.02605641, "epoch": 0.5383274213912103, "flos": 24934574240640.0, "grad_norm": 1.8552170874302316, "language_loss": 0.71007848, "learning_rate": 1.8490232593824186e-06, "loss": 0.73171568, "num_input_tokens_seen": 96678340, "step": 4477, "time_per_iteration": 2.5758397579193115 }, { "auxiliary_loss_clip": 0.01144866, "auxiliary_loss_mlp": 0.01026737, "balance_loss_clip": 1.05040896, "balance_loss_mlp": 1.01971841, "epoch": 0.5384476642818493, "flos": 22310186849280.0, "grad_norm": 1.6504780033505906, "language_loss": 0.84555912, "learning_rate": 1.8482465218598935e-06, "loss": 0.86727524, "num_input_tokens_seen": 96698285, "step": 4478, "time_per_iteration": 2.575528144836426 }, { "auxiliary_loss_clip": 0.01129872, "auxiliary_loss_mlp": 0.01025152, "balance_loss_clip": 1.04638457, "balance_loss_mlp": 1.01725447, "epoch": 0.5385679071724885, "flos": 22711021695360.0, "grad_norm": 1.8055930768958812, "language_loss": 0.83314675, "learning_rate": 1.8474698073582508e-06, "loss": 0.85469699, "num_input_tokens_seen": 96719655, "step": 4479, "time_per_iteration": 2.601571559906006 }, { "auxiliary_loss_clip": 0.01135968, "auxiliary_loss_mlp": 0.01029195, "balance_loss_clip": 1.04698145, "balance_loss_mlp": 1.02200055, "epoch": 0.5386881500631275, "flos": 15953746412160.0, "grad_norm": 2.141489712044353, "language_loss": 0.87691998, "learning_rate": 1.8466931159953166e-06, "loss": 0.89857161, "num_input_tokens_seen": 96736290, "step": 4480, "time_per_iteration": 2.5692572593688965 }, { "auxiliary_loss_clip": 0.0114809, "auxiliary_loss_mlp": 0.0102632, "balance_loss_clip": 1.0509088, "balance_loss_mlp": 1.01818979, "epoch": 0.5388083929537666, "flos": 24060041809920.0, "grad_norm": 1.8220826673824824, "language_loss": 0.83873266, "learning_rate": 1.8459164478889158e-06, "loss": 0.86047673, "num_input_tokens_seen": 96757685, "step": 4481, "time_per_iteration": 2.5797581672668457 }, { "auxiliary_loss_clip": 0.01124031, "auxiliary_loss_mlp": 0.01027651, "balance_loss_clip": 1.04424179, "balance_loss_mlp": 1.0200603, "epoch": 0.5389286358444056, "flos": 22236893147520.0, "grad_norm": 1.7677118968228311, "language_loss": 0.75871253, "learning_rate": 1.8451398031568663e-06, "loss": 0.78022933, "num_input_tokens_seen": 96777310, "step": 4482, "time_per_iteration": 3.380509376525879 }, { "auxiliary_loss_clip": 0.01130063, "auxiliary_loss_mlp": 0.01031211, "balance_loss_clip": 1.04836655, "balance_loss_mlp": 1.02382874, "epoch": 0.5390488787350448, "flos": 24281718595200.0, "grad_norm": 2.5652906914081544, "language_loss": 0.74674416, "learning_rate": 1.844363181916986e-06, "loss": 0.76835692, "num_input_tokens_seen": 96798035, "step": 4483, "time_per_iteration": 2.6464931964874268 }, { "auxiliary_loss_clip": 0.01155997, "auxiliary_loss_mlp": 0.01026132, "balance_loss_clip": 1.05019522, "balance_loss_mlp": 1.01778698, "epoch": 0.5391691216256839, "flos": 16581393688320.0, "grad_norm": 1.6962975320754579, "language_loss": 0.83378923, "learning_rate": 1.8435865842870868e-06, "loss": 0.85561049, "num_input_tokens_seen": 96815975, "step": 4484, "time_per_iteration": 3.2903785705566406 }, { "auxiliary_loss_clip": 0.01136349, "auxiliary_loss_mlp": 0.00761666, "balance_loss_clip": 1.04597604, "balance_loss_mlp": 1.00019944, "epoch": 0.5392893645163229, "flos": 23330049707520.0, "grad_norm": 1.85705123718222, "language_loss": 0.71888125, "learning_rate": 1.8428100103849787e-06, "loss": 0.73786139, "num_input_tokens_seen": 96835770, "step": 4485, "time_per_iteration": 3.3344228267669678 }, { "auxiliary_loss_clip": 0.01144678, "auxiliary_loss_mlp": 0.01028024, "balance_loss_clip": 1.05186725, "balance_loss_mlp": 1.02066851, "epoch": 0.5394096074069621, "flos": 15669801400320.0, "grad_norm": 2.135677202505748, "language_loss": 0.72932041, "learning_rate": 1.842033460328467e-06, "loss": 0.75104737, "num_input_tokens_seen": 96854490, "step": 4486, "time_per_iteration": 2.510364294052124 }, { "auxiliary_loss_clip": 0.01147514, "auxiliary_loss_mlp": 0.00760921, "balance_loss_clip": 1.04751956, "balance_loss_mlp": 1.00020921, "epoch": 0.5395298502976011, "flos": 22893447893760.0, "grad_norm": 1.803426852019116, "language_loss": 0.75032246, "learning_rate": 1.8412569342353541e-06, "loss": 0.7694068, "num_input_tokens_seen": 96874645, "step": 4487, "time_per_iteration": 2.584289073944092 }, { "auxiliary_loss_clip": 0.01154079, "auxiliary_loss_mlp": 0.01037151, "balance_loss_clip": 1.05457401, "balance_loss_mlp": 1.02893174, "epoch": 0.5396500931882402, "flos": 23842135952640.0, "grad_norm": 1.9759234765766658, "language_loss": 0.84889162, "learning_rate": 1.840480432223438e-06, "loss": 0.87080383, "num_input_tokens_seen": 96893650, "step": 4488, "time_per_iteration": 2.5503032207489014 }, { "auxiliary_loss_clip": 0.01149372, "auxiliary_loss_mlp": 0.01028147, "balance_loss_clip": 1.04876661, "balance_loss_mlp": 1.0212599, "epoch": 0.5397703360788794, "flos": 26322988596480.0, "grad_norm": 3.709495855450797, "language_loss": 0.77414691, "learning_rate": 1.8397039544105131e-06, "loss": 0.7959221, "num_input_tokens_seen": 96912735, "step": 4489, "time_per_iteration": 2.590378761291504 }, { "auxiliary_loss_clip": 0.01137775, "auxiliary_loss_mlp": 0.01025817, "balance_loss_clip": 1.04409647, "balance_loss_mlp": 1.01784766, "epoch": 0.5398905789695184, "flos": 21214588164480.0, "grad_norm": 1.982253255251768, "language_loss": 0.69592452, "learning_rate": 1.8389275009143711e-06, "loss": 0.71756041, "num_input_tokens_seen": 96932475, "step": 4490, "time_per_iteration": 2.6660993099212646 }, { "auxiliary_loss_clip": 0.01173792, "auxiliary_loss_mlp": 0.01026089, "balance_loss_clip": 1.05329561, "balance_loss_mlp": 1.01897526, "epoch": 0.5400108218601575, "flos": 25080335631360.0, "grad_norm": 1.7354977439049641, "language_loss": 0.73378897, "learning_rate": 1.8381510718527988e-06, "loss": 0.75578773, "num_input_tokens_seen": 96952085, "step": 4491, "time_per_iteration": 2.524327516555786 }, { "auxiliary_loss_clip": 0.01145586, "auxiliary_loss_mlp": 0.01030434, "balance_loss_clip": 1.04594111, "balance_loss_mlp": 1.02248633, "epoch": 0.5401310647507966, "flos": 26357498588160.0, "grad_norm": 1.927023761829669, "language_loss": 0.63551515, "learning_rate": 1.8373746673435812e-06, "loss": 0.65727532, "num_input_tokens_seen": 96973110, "step": 4492, "time_per_iteration": 2.5774412155151367 }, { "auxiliary_loss_clip": 0.01174717, "auxiliary_loss_mlp": 0.01029205, "balance_loss_clip": 1.05405939, "balance_loss_mlp": 1.02153063, "epoch": 0.5402513076414357, "flos": 27855332749440.0, "grad_norm": 1.6486967081605235, "language_loss": 0.78771687, "learning_rate": 1.8365982875044964e-06, "loss": 0.80975604, "num_input_tokens_seen": 96993420, "step": 4493, "time_per_iteration": 2.523552656173706 }, { "auxiliary_loss_clip": 0.01164604, "auxiliary_loss_mlp": 0.00761842, "balance_loss_clip": 1.05187476, "balance_loss_mlp": 1.00020528, "epoch": 0.5403715505320748, "flos": 22893771116160.0, "grad_norm": 2.275372689746067, "language_loss": 0.76386875, "learning_rate": 1.8358219324533217e-06, "loss": 0.78313321, "num_input_tokens_seen": 97013685, "step": 4494, "time_per_iteration": 2.524921417236328 }, { "auxiliary_loss_clip": 0.01142394, "auxiliary_loss_mlp": 0.01025541, "balance_loss_clip": 1.04801798, "balance_loss_mlp": 1.01871896, "epoch": 0.5404917934227139, "flos": 30224143895040.0, "grad_norm": 1.745199544479273, "language_loss": 0.70424396, "learning_rate": 1.8350456023078292e-06, "loss": 0.7259233, "num_input_tokens_seen": 97036060, "step": 4495, "time_per_iteration": 2.6190342903137207 }, { "auxiliary_loss_clip": 0.01176764, "auxiliary_loss_mlp": 0.01035194, "balance_loss_clip": 1.05210423, "balance_loss_mlp": 1.02686083, "epoch": 0.540612036313353, "flos": 19938502615680.0, "grad_norm": 2.0953170076416643, "language_loss": 0.78009176, "learning_rate": 1.8342692971857874e-06, "loss": 0.80221134, "num_input_tokens_seen": 97055260, "step": 4496, "time_per_iteration": 2.477479934692383 }, { "auxiliary_loss_clip": 0.01143397, "auxiliary_loss_mlp": 0.01024081, "balance_loss_clip": 1.04975176, "balance_loss_mlp": 1.0165056, "epoch": 0.540732279203992, "flos": 24279599692800.0, "grad_norm": 2.3060717890601996, "language_loss": 0.71488214, "learning_rate": 1.833493017204962e-06, "loss": 0.73655701, "num_input_tokens_seen": 97075365, "step": 4497, "time_per_iteration": 2.556281566619873 }, { "auxiliary_loss_clip": 0.01172283, "auxiliary_loss_mlp": 0.01027442, "balance_loss_clip": 1.05061162, "balance_loss_mlp": 1.01999462, "epoch": 0.5408525220946312, "flos": 20193216935040.0, "grad_norm": 1.8808488507904944, "language_loss": 0.7801308, "learning_rate": 1.8327167624831134e-06, "loss": 0.80212808, "num_input_tokens_seen": 97093095, "step": 4498, "time_per_iteration": 2.4964852333068848 }, { "auxiliary_loss_clip": 0.01172707, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 1.05267334, "balance_loss_mlp": 1.0211935, "epoch": 0.5409727649852702, "flos": 24134448833280.0, "grad_norm": 1.7847695363344214, "language_loss": 0.70628053, "learning_rate": 1.831940533137999e-06, "loss": 0.72829401, "num_input_tokens_seen": 97112000, "step": 4499, "time_per_iteration": 2.477630615234375 }, { "auxiliary_loss_clip": 0.01158061, "auxiliary_loss_mlp": 0.0102923, "balance_loss_clip": 1.05260396, "balance_loss_mlp": 1.02188981, "epoch": 0.5410930078759093, "flos": 23912700220800.0, "grad_norm": 1.7170739771960104, "language_loss": 0.72135407, "learning_rate": 1.8311643292873718e-06, "loss": 0.74322701, "num_input_tokens_seen": 97130820, "step": 4500, "time_per_iteration": 2.5614125728607178 }, { "auxiliary_loss_clip": 0.01157025, "auxiliary_loss_mlp": 0.01030275, "balance_loss_clip": 1.05044675, "balance_loss_mlp": 1.02302086, "epoch": 0.5412132507665485, "flos": 21105132445440.0, "grad_norm": 1.8136881216515928, "language_loss": 0.87830108, "learning_rate": 1.8303881510489818e-06, "loss": 0.90017408, "num_input_tokens_seen": 97149210, "step": 4501, "time_per_iteration": 2.495850086212158 }, { "auxiliary_loss_clip": 0.01148965, "auxiliary_loss_mlp": 0.0103366, "balance_loss_clip": 1.05043435, "balance_loss_mlp": 1.02622414, "epoch": 0.5413334936571875, "flos": 30227340205440.0, "grad_norm": 1.9684557899779136, "language_loss": 0.69585156, "learning_rate": 1.829611998540574e-06, "loss": 0.71767783, "num_input_tokens_seen": 97170415, "step": 4502, "time_per_iteration": 3.368846893310547 }, { "auxiliary_loss_clip": 0.01157482, "auxiliary_loss_mlp": 0.00761328, "balance_loss_clip": 1.04896832, "balance_loss_mlp": 1.00022316, "epoch": 0.5414537365478266, "flos": 24279635606400.0, "grad_norm": 1.6846173387796528, "language_loss": 0.8000071, "learning_rate": 1.8288358718798914e-06, "loss": 0.81919521, "num_input_tokens_seen": 97189605, "step": 4503, "time_per_iteration": 2.5297632217407227 }, { "auxiliary_loss_clip": 0.01155633, "auxiliary_loss_mlp": 0.0076106, "balance_loss_clip": 1.05196702, "balance_loss_mlp": 1.00023913, "epoch": 0.5415739794384657, "flos": 16654543735680.0, "grad_norm": 2.2871008551117504, "language_loss": 0.7251265, "learning_rate": 1.8280597711846703e-06, "loss": 0.74429345, "num_input_tokens_seen": 97207845, "step": 4504, "time_per_iteration": 2.4851527214050293 }, { "auxiliary_loss_clip": 0.01154776, "auxiliary_loss_mlp": 0.01034451, "balance_loss_clip": 1.05053353, "balance_loss_mlp": 1.02679777, "epoch": 0.5416942223291048, "flos": 23185724860800.0, "grad_norm": 1.8814552803251026, "language_loss": 0.83340228, "learning_rate": 1.8272836965726455e-06, "loss": 0.85529459, "num_input_tokens_seen": 97226780, "step": 4505, "time_per_iteration": 2.5215506553649902 }, { "auxiliary_loss_clip": 0.01100691, "auxiliary_loss_mlp": 0.01029298, "balance_loss_clip": 1.04120219, "balance_loss_mlp": 1.02141821, "epoch": 0.5418144652197439, "flos": 20303247271680.0, "grad_norm": 1.7301348795031142, "language_loss": 0.78312564, "learning_rate": 1.8265076481615461e-06, "loss": 0.80442548, "num_input_tokens_seen": 97246695, "step": 4506, "time_per_iteration": 2.6491940021514893 }, { "auxiliary_loss_clip": 0.01146621, "auxiliary_loss_mlp": 0.01034919, "balance_loss_clip": 1.05332005, "balance_loss_mlp": 1.02703333, "epoch": 0.541934708110383, "flos": 12458633431680.0, "grad_norm": 2.1755207207520444, "language_loss": 0.87128305, "learning_rate": 1.8257316260690987e-06, "loss": 0.89309841, "num_input_tokens_seen": 97264480, "step": 4507, "time_per_iteration": 2.508147954940796 }, { "auxiliary_loss_clip": 0.01156727, "auxiliary_loss_mlp": 0.0102686, "balance_loss_clip": 1.04938114, "balance_loss_mlp": 1.01974022, "epoch": 0.5420549510010221, "flos": 21253802837760.0, "grad_norm": 1.5970420017082163, "language_loss": 0.75961983, "learning_rate": 1.8249556304130254e-06, "loss": 0.78145564, "num_input_tokens_seen": 97285760, "step": 4508, "time_per_iteration": 3.299811840057373 }, { "auxiliary_loss_clip": 0.01133842, "auxiliary_loss_mlp": 0.01028509, "balance_loss_clip": 1.04428852, "balance_loss_mlp": 1.02056098, "epoch": 0.5421751938916611, "flos": 29490524519040.0, "grad_norm": 2.0632923057076344, "language_loss": 0.68370616, "learning_rate": 1.824179661311044e-06, "loss": 0.70532972, "num_input_tokens_seen": 97304510, "step": 4509, "time_per_iteration": 2.5958242416381836 }, { "auxiliary_loss_clip": 0.01115107, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.04215336, "balance_loss_mlp": 1.02261996, "epoch": 0.5422954367823003, "flos": 18734238311040.0, "grad_norm": 1.803659263850139, "language_loss": 0.79886162, "learning_rate": 1.823403718880868e-06, "loss": 0.8203094, "num_input_tokens_seen": 97323270, "step": 4510, "time_per_iteration": 3.4088265895843506 }, { "auxiliary_loss_clip": 0.01143507, "auxiliary_loss_mlp": 0.01031541, "balance_loss_clip": 1.04439902, "balance_loss_mlp": 1.02410853, "epoch": 0.5424156796729394, "flos": 39969006940800.0, "grad_norm": 1.781754194865738, "language_loss": 0.66485459, "learning_rate": 1.822627803240207e-06, "loss": 0.6866051, "num_input_tokens_seen": 97345600, "step": 4511, "time_per_iteration": 3.419186592102051 }, { "auxiliary_loss_clip": 0.01135781, "auxiliary_loss_mlp": 0.01027568, "balance_loss_clip": 1.04751039, "balance_loss_mlp": 1.02036488, "epoch": 0.5425359225635784, "flos": 11546538353280.0, "grad_norm": 1.9982019981274868, "language_loss": 0.85235786, "learning_rate": 1.8218519145067675e-06, "loss": 0.87399137, "num_input_tokens_seen": 97361220, "step": 4512, "time_per_iteration": 2.554074287414551 }, { "auxiliary_loss_clip": 0.01124083, "auxiliary_loss_mlp": 0.01033454, "balance_loss_clip": 1.04473352, "balance_loss_mlp": 1.02609575, "epoch": 0.5426561654542175, "flos": 20229702174720.0, "grad_norm": 1.772711627749825, "language_loss": 0.89455473, "learning_rate": 1.8210760527982508e-06, "loss": 0.91613013, "num_input_tokens_seen": 97381505, "step": 4513, "time_per_iteration": 2.5712273120880127 }, { "auxiliary_loss_clip": 0.01148283, "auxiliary_loss_mlp": 0.00760623, "balance_loss_clip": 1.05099165, "balance_loss_mlp": 1.000211, "epoch": 0.5427764083448566, "flos": 21871681614720.0, "grad_norm": 1.8169514762452057, "language_loss": 0.74991077, "learning_rate": 1.8203002182323552e-06, "loss": 0.76899981, "num_input_tokens_seen": 97399060, "step": 4514, "time_per_iteration": 2.5578527450561523 }, { "auxiliary_loss_clip": 0.01144746, "auxiliary_loss_mlp": 0.01031498, "balance_loss_clip": 1.04771018, "balance_loss_mlp": 1.02454519, "epoch": 0.5428966512354957, "flos": 19640946349440.0, "grad_norm": 1.8161369424481932, "language_loss": 0.75657761, "learning_rate": 1.819524410926773e-06, "loss": 0.77834004, "num_input_tokens_seen": 97416740, "step": 4515, "time_per_iteration": 2.5369153022766113 }, { "auxiliary_loss_clip": 0.01103555, "auxiliary_loss_mlp": 0.01030787, "balance_loss_clip": 1.04614091, "balance_loss_mlp": 1.02326441, "epoch": 0.5430168941261347, "flos": 22382187661440.0, "grad_norm": 1.420428915875673, "language_loss": 0.77123809, "learning_rate": 1.8187486309991944e-06, "loss": 0.7925815, "num_input_tokens_seen": 97437620, "step": 4516, "time_per_iteration": 2.627680778503418 }, { "auxiliary_loss_clip": 0.01162488, "auxiliary_loss_mlp": 0.01030583, "balance_loss_clip": 1.05113792, "balance_loss_mlp": 1.02363896, "epoch": 0.5431371370167739, "flos": 18764187275520.0, "grad_norm": 1.8368545068615874, "language_loss": 0.77168787, "learning_rate": 1.817972878567304e-06, "loss": 0.79361856, "num_input_tokens_seen": 97456275, "step": 4517, "time_per_iteration": 2.5082075595855713 }, { "auxiliary_loss_clip": 0.01149986, "auxiliary_loss_mlp": 0.01026037, "balance_loss_clip": 1.04822576, "balance_loss_mlp": 1.01847291, "epoch": 0.543257379907413, "flos": 18806023641600.0, "grad_norm": 1.8084849392479445, "language_loss": 0.76467508, "learning_rate": 1.8171971537487834e-06, "loss": 0.78643531, "num_input_tokens_seen": 97474925, "step": 4518, "time_per_iteration": 2.5497963428497314 }, { "auxiliary_loss_clip": 0.01175975, "auxiliary_loss_mlp": 0.01033807, "balance_loss_clip": 1.05270016, "balance_loss_mlp": 1.0256803, "epoch": 0.543377622798052, "flos": 17493381025920.0, "grad_norm": 1.9440421124742004, "language_loss": 0.81079578, "learning_rate": 1.8164214566613093e-06, "loss": 0.83289361, "num_input_tokens_seen": 97493550, "step": 4519, "time_per_iteration": 2.4588522911071777 }, { "auxiliary_loss_clip": 0.01170882, "auxiliary_loss_mlp": 0.01029211, "balance_loss_clip": 1.05079317, "balance_loss_mlp": 1.02199888, "epoch": 0.5434978656886912, "flos": 18989311766400.0, "grad_norm": 3.7631665621295713, "language_loss": 0.65725094, "learning_rate": 1.8156457874225547e-06, "loss": 0.67925185, "num_input_tokens_seen": 97512010, "step": 4520, "time_per_iteration": 2.4552416801452637 }, { "auxiliary_loss_clip": 0.01139593, "auxiliary_loss_mlp": 0.01027861, "balance_loss_clip": 1.05067205, "balance_loss_mlp": 1.02007651, "epoch": 0.5436181085793302, "flos": 17274936464640.0, "grad_norm": 1.7592235726309076, "language_loss": 0.80290997, "learning_rate": 1.814870146150187e-06, "loss": 0.82458448, "num_input_tokens_seen": 97530120, "step": 4521, "time_per_iteration": 2.5324466228485107 }, { "auxiliary_loss_clip": 0.01150186, "auxiliary_loss_mlp": 0.01033101, "balance_loss_clip": 1.04792738, "balance_loss_mlp": 1.02452993, "epoch": 0.5437383514699693, "flos": 19098587917440.0, "grad_norm": 1.9239791888381743, "language_loss": 0.78686953, "learning_rate": 1.814094532961871e-06, "loss": 0.80870235, "num_input_tokens_seen": 97548695, "step": 4522, "time_per_iteration": 2.5349996089935303 }, { "auxiliary_loss_clip": 0.01118216, "auxiliary_loss_mlp": 0.01027373, "balance_loss_clip": 1.04298949, "balance_loss_mlp": 1.02029848, "epoch": 0.5438585943606085, "flos": 22602715211520.0, "grad_norm": 1.9188857387357585, "language_loss": 0.83647019, "learning_rate": 1.8133189479752666e-06, "loss": 0.85792607, "num_input_tokens_seen": 97567625, "step": 4523, "time_per_iteration": 2.6168689727783203 }, { "auxiliary_loss_clip": 0.01172812, "auxiliary_loss_mlp": 0.01026924, "balance_loss_clip": 1.05243993, "balance_loss_mlp": 1.01946712, "epoch": 0.5439788372512475, "flos": 21798495653760.0, "grad_norm": 1.8368094410215154, "language_loss": 0.81802088, "learning_rate": 1.8125433913080292e-06, "loss": 0.84001827, "num_input_tokens_seen": 97585325, "step": 4524, "time_per_iteration": 2.4799692630767822 }, { "auxiliary_loss_clip": 0.01048274, "auxiliary_loss_mlp": 0.01032826, "balance_loss_clip": 1.03594112, "balance_loss_mlp": 1.02575374, "epoch": 0.5440990801418866, "flos": 16399362539520.0, "grad_norm": 2.6994370033089234, "language_loss": 0.82791448, "learning_rate": 1.811767863077811e-06, "loss": 0.8487255, "num_input_tokens_seen": 97604275, "step": 4525, "time_per_iteration": 2.914135694503784 }, { "auxiliary_loss_clip": 0.01094441, "auxiliary_loss_mlp": 0.01026666, "balance_loss_clip": 1.04537809, "balance_loss_mlp": 1.01935887, "epoch": 0.5442193230325257, "flos": 21615638492160.0, "grad_norm": 1.656496293934783, "language_loss": 0.78256232, "learning_rate": 1.8109923634022577e-06, "loss": 0.8037734, "num_input_tokens_seen": 97624300, "step": 4526, "time_per_iteration": 2.8230338096618652 }, { "auxiliary_loss_clip": 0.01176438, "auxiliary_loss_mlp": 0.01028634, "balance_loss_clip": 1.05319309, "balance_loss_mlp": 1.02120709, "epoch": 0.5443395659231648, "flos": 15481198062720.0, "grad_norm": 2.125115692683203, "language_loss": 0.86672091, "learning_rate": 1.8102168923990128e-06, "loss": 0.88877165, "num_input_tokens_seen": 97637845, "step": 4527, "time_per_iteration": 2.4410343170166016 }, { "auxiliary_loss_clip": 0.01160525, "auxiliary_loss_mlp": 0.00761243, "balance_loss_clip": 1.05057931, "balance_loss_mlp": 1.00024617, "epoch": 0.5444598088138038, "flos": 18770436241920.0, "grad_norm": 1.7039613556941138, "language_loss": 0.79923749, "learning_rate": 1.809441450185714e-06, "loss": 0.81845516, "num_input_tokens_seen": 97656330, "step": 4528, "time_per_iteration": 3.453847885131836 }, { "auxiliary_loss_clip": 0.01146962, "auxiliary_loss_mlp": 0.01030389, "balance_loss_clip": 1.04405475, "balance_loss_mlp": 1.02277184, "epoch": 0.544580051704443, "flos": 21142335957120.0, "grad_norm": 1.999675578149202, "language_loss": 0.73321521, "learning_rate": 1.8086660368799958e-06, "loss": 0.75498873, "num_input_tokens_seen": 97674380, "step": 4529, "time_per_iteration": 2.5472123622894287 }, { "auxiliary_loss_clip": 0.01145623, "auxiliary_loss_mlp": 0.0102803, "balance_loss_clip": 1.04833508, "balance_loss_mlp": 1.02041864, "epoch": 0.5447002945950821, "flos": 32491508054400.0, "grad_norm": 3.1535506303916554, "language_loss": 0.77384734, "learning_rate": 1.807890652599488e-06, "loss": 0.79558384, "num_input_tokens_seen": 97698765, "step": 4530, "time_per_iteration": 2.632446527481079 }, { "auxiliary_loss_clip": 0.01169974, "auxiliary_loss_mlp": 0.01024509, "balance_loss_clip": 1.05073071, "balance_loss_mlp": 1.01796484, "epoch": 0.5448205374857211, "flos": 11798307757440.0, "grad_norm": 2.2201867164719298, "language_loss": 0.82376647, "learning_rate": 1.8071152974618156e-06, "loss": 0.84571135, "num_input_tokens_seen": 97716565, "step": 4531, "time_per_iteration": 2.444945812225342 }, { "auxiliary_loss_clip": 0.01131397, "auxiliary_loss_mlp": 0.00760998, "balance_loss_clip": 1.04531264, "balance_loss_mlp": 1.0001967, "epoch": 0.5449407803763603, "flos": 24133766474880.0, "grad_norm": 2.3115406773166693, "language_loss": 0.7847929, "learning_rate": 1.806339971584599e-06, "loss": 0.80371684, "num_input_tokens_seen": 97733225, "step": 4532, "time_per_iteration": 2.615262985229492 }, { "auxiliary_loss_clip": 0.01174283, "auxiliary_loss_mlp": 0.01027129, "balance_loss_clip": 1.05225706, "balance_loss_mlp": 1.02025354, "epoch": 0.5450610232669993, "flos": 23258551685760.0, "grad_norm": 2.0105623028128727, "language_loss": 0.85277081, "learning_rate": 1.8055646750854546e-06, "loss": 0.87478495, "num_input_tokens_seen": 97752735, "step": 4533, "time_per_iteration": 2.461588144302368 }, { "auxiliary_loss_clip": 0.01148912, "auxiliary_loss_mlp": 0.01031431, "balance_loss_clip": 1.04862261, "balance_loss_mlp": 1.02437115, "epoch": 0.5451812661576384, "flos": 17785083375360.0, "grad_norm": 3.9404851440783757, "language_loss": 0.815759, "learning_rate": 1.8047894080819945e-06, "loss": 0.83756238, "num_input_tokens_seen": 97769985, "step": 4534, "time_per_iteration": 3.4477219581604004 }, { "auxiliary_loss_clip": 0.01068408, "auxiliary_loss_mlp": 0.01001064, "balance_loss_clip": 1.0182842, "balance_loss_mlp": 0.99998468, "epoch": 0.5453015090482776, "flos": 71062586513280.0, "grad_norm": 0.7231144361059072, "language_loss": 0.63193268, "learning_rate": 1.8040141706918258e-06, "loss": 0.65262741, "num_input_tokens_seen": 97831225, "step": 4535, "time_per_iteration": 3.2205827236175537 }, { "auxiliary_loss_clip": 0.01146099, "auxiliary_loss_mlp": 0.01036797, "balance_loss_clip": 1.05124354, "balance_loss_mlp": 1.0294416, "epoch": 0.5454217519389166, "flos": 25552201622400.0, "grad_norm": 1.8177189080241651, "language_loss": 0.76761949, "learning_rate": 1.8032389630325525e-06, "loss": 0.78944844, "num_input_tokens_seen": 97849975, "step": 4536, "time_per_iteration": 3.3661441802978516 }, { "auxiliary_loss_clip": 0.0114362, "auxiliary_loss_mlp": 0.01024108, "balance_loss_clip": 1.04488063, "balance_loss_mlp": 1.01684821, "epoch": 0.5455419948295557, "flos": 23658345037440.0, "grad_norm": 1.6596121196440627, "language_loss": 0.75721228, "learning_rate": 1.8024637852217707e-06, "loss": 0.77888954, "num_input_tokens_seen": 97869700, "step": 4537, "time_per_iteration": 3.3125128746032715 }, { "auxiliary_loss_clip": 0.01145971, "auxiliary_loss_mlp": 0.01031654, "balance_loss_clip": 1.04975867, "balance_loss_mlp": 1.02489495, "epoch": 0.5456622377201948, "flos": 23403989854080.0, "grad_norm": 1.7108320441418228, "language_loss": 0.84732544, "learning_rate": 1.8016886373770766e-06, "loss": 0.8691017, "num_input_tokens_seen": 97888215, "step": 4538, "time_per_iteration": 2.5284061431884766 }, { "auxiliary_loss_clip": 0.01143021, "auxiliary_loss_mlp": 0.01030581, "balance_loss_clip": 1.04706335, "balance_loss_mlp": 1.02353549, "epoch": 0.5457824806108339, "flos": 23988040997760.0, "grad_norm": 1.644916313372015, "language_loss": 0.78873444, "learning_rate": 1.8009135196160579e-06, "loss": 0.81047046, "num_input_tokens_seen": 97907090, "step": 4539, "time_per_iteration": 2.560939311981201 }, { "auxiliary_loss_clip": 0.01125117, "auxiliary_loss_mlp": 0.01030086, "balance_loss_clip": 1.04413843, "balance_loss_mlp": 1.02297544, "epoch": 0.545902723501473, "flos": 22565870835840.0, "grad_norm": 1.6988995798717297, "language_loss": 0.84048647, "learning_rate": 1.8001384320563e-06, "loss": 0.86203849, "num_input_tokens_seen": 97927345, "step": 4540, "time_per_iteration": 2.5556275844573975 }, { "auxiliary_loss_clip": 0.01067289, "auxiliary_loss_mlp": 0.01002623, "balance_loss_clip": 1.01714301, "balance_loss_mlp": 1.00154436, "epoch": 0.5460229663921121, "flos": 55198399685760.0, "grad_norm": 0.7714194369362715, "language_loss": 0.57780403, "learning_rate": 1.7993633748153833e-06, "loss": 0.59850311, "num_input_tokens_seen": 97981950, "step": 4541, "time_per_iteration": 2.9763591289520264 }, { "auxiliary_loss_clip": 0.01162699, "auxiliary_loss_mlp": 0.01033448, "balance_loss_clip": 1.05059314, "balance_loss_mlp": 1.02566361, "epoch": 0.5461432092827512, "flos": 15413866018560.0, "grad_norm": 1.828538665541505, "language_loss": 0.72662723, "learning_rate": 1.7985883480108834e-06, "loss": 0.74858868, "num_input_tokens_seen": 97999585, "step": 4542, "time_per_iteration": 2.4843533039093018 }, { "auxiliary_loss_clip": 0.01153242, "auxiliary_loss_mlp": 0.01029447, "balance_loss_clip": 1.04845071, "balance_loss_mlp": 1.02203846, "epoch": 0.5462634521733902, "flos": 24024921287040.0, "grad_norm": 1.5135727600232125, "language_loss": 0.72103, "learning_rate": 1.797813351760371e-06, "loss": 0.74285686, "num_input_tokens_seen": 98021290, "step": 4543, "time_per_iteration": 2.5319628715515137 }, { "auxiliary_loss_clip": 0.01173669, "auxiliary_loss_mlp": 0.01030659, "balance_loss_clip": 1.05092025, "balance_loss_mlp": 1.0224576, "epoch": 0.5463836950640293, "flos": 22820944291200.0, "grad_norm": 1.6934831955837053, "language_loss": 0.78101063, "learning_rate": 1.7970383861814116e-06, "loss": 0.80305398, "num_input_tokens_seen": 98041060, "step": 4544, "time_per_iteration": 2.484126091003418 }, { "auxiliary_loss_clip": 0.0116058, "auxiliary_loss_mlp": 0.01029918, "balance_loss_clip": 1.05111182, "balance_loss_mlp": 1.0222683, "epoch": 0.5465039379546685, "flos": 20448290390400.0, "grad_norm": 1.8501570565190402, "language_loss": 0.73714715, "learning_rate": 1.7962634513915684e-06, "loss": 0.75905216, "num_input_tokens_seen": 98058410, "step": 4545, "time_per_iteration": 2.478583574295044 }, { "auxiliary_loss_clip": 0.01170668, "auxiliary_loss_mlp": 0.01026131, "balance_loss_clip": 1.05007172, "balance_loss_mlp": 1.01894236, "epoch": 0.5466241808453075, "flos": 17343310003200.0, "grad_norm": 1.6417209251406886, "language_loss": 0.7935577, "learning_rate": 1.7954885475083969e-06, "loss": 0.81552565, "num_input_tokens_seen": 98076080, "step": 4546, "time_per_iteration": 2.455613851547241 }, { "auxiliary_loss_clip": 0.01175657, "auxiliary_loss_mlp": 0.0102767, "balance_loss_clip": 1.05358899, "balance_loss_mlp": 1.01974249, "epoch": 0.5467444237359466, "flos": 21617039122560.0, "grad_norm": 1.9834666222739512, "language_loss": 0.72459865, "learning_rate": 1.7947136746494513e-06, "loss": 0.74663192, "num_input_tokens_seen": 98096995, "step": 4547, "time_per_iteration": 2.5557448863983154 }, { "auxiliary_loss_clip": 0.01156711, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.05012679, "balance_loss_mlp": 1.02667999, "epoch": 0.5468646666265857, "flos": 24170467196160.0, "grad_norm": 1.9321732620102325, "language_loss": 0.87753856, "learning_rate": 1.793938832932277e-06, "loss": 0.89944869, "num_input_tokens_seen": 98115105, "step": 4548, "time_per_iteration": 2.5109670162200928 }, { "auxiliary_loss_clip": 0.01173557, "auxiliary_loss_mlp": 0.01028551, "balance_loss_clip": 1.05136085, "balance_loss_mlp": 1.02087426, "epoch": 0.5469849095172248, "flos": 27527001505920.0, "grad_norm": 1.829343692529943, "language_loss": 0.70278496, "learning_rate": 1.7931640224744185e-06, "loss": 0.72480607, "num_input_tokens_seen": 98135655, "step": 4549, "time_per_iteration": 2.492534637451172 }, { "auxiliary_loss_clip": 0.01117806, "auxiliary_loss_mlp": 0.0102899, "balance_loss_clip": 1.04053283, "balance_loss_mlp": 1.02150655, "epoch": 0.5471051524078638, "flos": 27964680727680.0, "grad_norm": 2.260880458872386, "language_loss": 0.73672605, "learning_rate": 1.7923892433934127e-06, "loss": 0.75819409, "num_input_tokens_seen": 98156730, "step": 4550, "time_per_iteration": 2.633251428604126 }, { "auxiliary_loss_clip": 0.01145147, "auxiliary_loss_mlp": 0.0076131, "balance_loss_clip": 1.04831123, "balance_loss_mlp": 1.0002234, "epoch": 0.547225395298503, "flos": 18150510389760.0, "grad_norm": 1.6664445758579618, "language_loss": 0.78824514, "learning_rate": 1.7916144958067939e-06, "loss": 0.80730969, "num_input_tokens_seen": 98174590, "step": 4551, "time_per_iteration": 2.517235517501831 }, { "auxiliary_loss_clip": 0.01160045, "auxiliary_loss_mlp": 0.0102216, "balance_loss_clip": 1.04984426, "balance_loss_mlp": 1.01506388, "epoch": 0.5473456381891421, "flos": 21361498790400.0, "grad_norm": 1.750205297808644, "language_loss": 0.792377, "learning_rate": 1.7908397798320905e-06, "loss": 0.81419903, "num_input_tokens_seen": 98194325, "step": 4552, "time_per_iteration": 2.491636037826538 }, { "auxiliary_loss_clip": 0.0116203, "auxiliary_loss_mlp": 0.00761608, "balance_loss_clip": 1.05250609, "balance_loss_mlp": 1.0002079, "epoch": 0.5474658810797811, "flos": 19932145908480.0, "grad_norm": 1.6173497418781952, "language_loss": 0.74571586, "learning_rate": 1.7900650955868265e-06, "loss": 0.76495218, "num_input_tokens_seen": 98213970, "step": 4553, "time_per_iteration": 2.4878342151641846 }, { "auxiliary_loss_clip": 0.01163259, "auxiliary_loss_mlp": 0.00761305, "balance_loss_clip": 1.05455184, "balance_loss_mlp": 1.0002178, "epoch": 0.5475861239704203, "flos": 50476217264640.0, "grad_norm": 1.3778755152156756, "language_loss": 0.76573819, "learning_rate": 1.7892904431885202e-06, "loss": 0.78498387, "num_input_tokens_seen": 98241145, "step": 4554, "time_per_iteration": 2.76833176612854 }, { "auxiliary_loss_clip": 0.01117304, "auxiliary_loss_mlp": 0.01026813, "balance_loss_clip": 1.0427562, "balance_loss_mlp": 1.02029538, "epoch": 0.5477063668610593, "flos": 20705123612160.0, "grad_norm": 1.7712828200376882, "language_loss": 0.75249958, "learning_rate": 1.788515822754686e-06, "loss": 0.77394074, "num_input_tokens_seen": 98261565, "step": 4555, "time_per_iteration": 3.402116537094116 }, { "auxiliary_loss_clip": 0.0112919, "auxiliary_loss_mlp": 0.01028574, "balance_loss_clip": 1.04337561, "balance_loss_mlp": 1.02060485, "epoch": 0.5478266097516984, "flos": 19609740408960.0, "grad_norm": 2.107175163416886, "language_loss": 0.78708494, "learning_rate": 1.7877412344028335e-06, "loss": 0.80866253, "num_input_tokens_seen": 98281370, "step": 4556, "time_per_iteration": 2.570615530014038 }, { "auxiliary_loss_clip": 0.0116199, "auxiliary_loss_mlp": 0.0102634, "balance_loss_clip": 1.05012286, "balance_loss_mlp": 1.01919341, "epoch": 0.5479468526423376, "flos": 12896599962240.0, "grad_norm": 2.4578022591741076, "language_loss": 0.77394599, "learning_rate": 1.7869666782504668e-06, "loss": 0.7958293, "num_input_tokens_seen": 98297950, "step": 4557, "time_per_iteration": 2.461474657058716 }, { "auxiliary_loss_clip": 0.01133866, "auxiliary_loss_mlp": 0.01026149, "balance_loss_clip": 1.04528844, "balance_loss_mlp": 1.01891243, "epoch": 0.5480670955329766, "flos": 18588800142720.0, "grad_norm": 1.8186025090460671, "language_loss": 0.69103527, "learning_rate": 1.7861921544150867e-06, "loss": 0.7126354, "num_input_tokens_seen": 98316800, "step": 4558, "time_per_iteration": 2.5157859325408936 }, { "auxiliary_loss_clip": 0.01090958, "auxiliary_loss_mlp": 0.00761339, "balance_loss_clip": 1.04270601, "balance_loss_mlp": 1.00019062, "epoch": 0.5481873384236157, "flos": 15954608338560.0, "grad_norm": 1.7055631689701296, "language_loss": 0.76412821, "learning_rate": 1.7854176630141856e-06, "loss": 0.78265119, "num_input_tokens_seen": 98333935, "step": 4559, "time_per_iteration": 2.6016974449157715 }, { "auxiliary_loss_clip": 0.01176187, "auxiliary_loss_mlp": 0.01029932, "balance_loss_clip": 1.05330229, "balance_loss_mlp": 1.02183819, "epoch": 0.5483075813142548, "flos": 22783812606720.0, "grad_norm": 2.1427418609319004, "language_loss": 0.84529459, "learning_rate": 1.784643204165255e-06, "loss": 0.86735582, "num_input_tokens_seen": 98353255, "step": 4560, "time_per_iteration": 3.2781002521514893 }, { "auxiliary_loss_clip": 0.01152323, "auxiliary_loss_mlp": 0.01026916, "balance_loss_clip": 1.0506742, "balance_loss_mlp": 1.01950729, "epoch": 0.5484278242048939, "flos": 19317212046720.0, "grad_norm": 1.9663156552665761, "language_loss": 0.77177197, "learning_rate": 1.7838687779857783e-06, "loss": 0.79356432, "num_input_tokens_seen": 98371130, "step": 4561, "time_per_iteration": 2.4860503673553467 }, { "auxiliary_loss_clip": 0.0113656, "auxiliary_loss_mlp": 0.01028841, "balance_loss_clip": 1.04474592, "balance_loss_mlp": 1.02136338, "epoch": 0.5485480670955329, "flos": 22816024128000.0, "grad_norm": 1.7899858350517748, "language_loss": 0.64244628, "learning_rate": 1.7830943845932366e-06, "loss": 0.66410035, "num_input_tokens_seen": 98390455, "step": 4562, "time_per_iteration": 3.2955172061920166 }, { "auxiliary_loss_clip": 0.01147174, "auxiliary_loss_mlp": 0.01024357, "balance_loss_clip": 1.04909444, "balance_loss_mlp": 1.01722217, "epoch": 0.5486683099861721, "flos": 22671304231680.0, "grad_norm": 4.059680989245851, "language_loss": 0.75091404, "learning_rate": 1.7823200241051044e-06, "loss": 0.77262926, "num_input_tokens_seen": 98409370, "step": 4563, "time_per_iteration": 3.34077787399292 }, { "auxiliary_loss_clip": 0.01175585, "auxiliary_loss_mlp": 0.01028363, "balance_loss_clip": 1.05474854, "balance_loss_mlp": 1.02142513, "epoch": 0.5487885528768112, "flos": 23149383275520.0, "grad_norm": 1.8910477433949866, "language_loss": 0.80734062, "learning_rate": 1.7815456966388513e-06, "loss": 0.8293801, "num_input_tokens_seen": 98428465, "step": 4564, "time_per_iteration": 2.4698710441589355 }, { "auxiliary_loss_clip": 0.01130069, "auxiliary_loss_mlp": 0.01027893, "balance_loss_clip": 1.04536223, "balance_loss_mlp": 1.02038836, "epoch": 0.5489087957674502, "flos": 22053928245120.0, "grad_norm": 1.9555679514594209, "language_loss": 0.80994964, "learning_rate": 1.780771402311943e-06, "loss": 0.83152926, "num_input_tokens_seen": 98447300, "step": 4565, "time_per_iteration": 2.5577876567840576 }, { "auxiliary_loss_clip": 0.01147415, "auxiliary_loss_mlp": 0.01028587, "balance_loss_clip": 1.05255055, "balance_loss_mlp": 1.02125597, "epoch": 0.5490290386580894, "flos": 24315977191680.0, "grad_norm": 1.6599988998710056, "language_loss": 0.7865994, "learning_rate": 1.7799971412418374e-06, "loss": 0.8083595, "num_input_tokens_seen": 98468695, "step": 4566, "time_per_iteration": 2.534700393676758 }, { "auxiliary_loss_clip": 0.01128641, "auxiliary_loss_mlp": 0.01034545, "balance_loss_clip": 1.04632866, "balance_loss_mlp": 1.02618885, "epoch": 0.5491492815487284, "flos": 18294942977280.0, "grad_norm": 2.0046325819587287, "language_loss": 0.7382971, "learning_rate": 1.7792229135459918e-06, "loss": 0.75992894, "num_input_tokens_seen": 98485345, "step": 4567, "time_per_iteration": 2.522423505783081 }, { "auxiliary_loss_clip": 0.01022027, "auxiliary_loss_mlp": 0.01004236, "balance_loss_clip": 1.01978636, "balance_loss_mlp": 1.00310314, "epoch": 0.5492695244393675, "flos": 64550257050240.0, "grad_norm": 0.7343310433939847, "language_loss": 0.61704183, "learning_rate": 1.7784487193418538e-06, "loss": 0.63730448, "num_input_tokens_seen": 98543195, "step": 4568, "time_per_iteration": 3.0791804790496826 }, { "auxiliary_loss_clip": 0.01114263, "auxiliary_loss_mlp": 0.0102549, "balance_loss_clip": 1.04030704, "balance_loss_mlp": 1.01722026, "epoch": 0.5493897673300067, "flos": 17379579761280.0, "grad_norm": 1.7656708256883211, "language_loss": 0.61381847, "learning_rate": 1.7776745587468698e-06, "loss": 0.635216, "num_input_tokens_seen": 98560620, "step": 4569, "time_per_iteration": 2.523169755935669 }, { "auxiliary_loss_clip": 0.01170477, "auxiliary_loss_mlp": 0.01027694, "balance_loss_clip": 1.0485729, "balance_loss_mlp": 1.02011275, "epoch": 0.5495100102206457, "flos": 19901765980800.0, "grad_norm": 2.2555221531247156, "language_loss": 0.81925845, "learning_rate": 1.7769004318784776e-06, "loss": 0.84124017, "num_input_tokens_seen": 98578265, "step": 4570, "time_per_iteration": 2.4462785720825195 }, { "auxiliary_loss_clip": 0.0115996, "auxiliary_loss_mlp": 0.010267, "balance_loss_clip": 1.04938567, "balance_loss_mlp": 1.01895773, "epoch": 0.5496302531112848, "flos": 16727190992640.0, "grad_norm": 1.744062514430034, "language_loss": 0.80500263, "learning_rate": 1.776126338854113e-06, "loss": 0.82686925, "num_input_tokens_seen": 98596055, "step": 4571, "time_per_iteration": 2.4639976024627686 }, { "auxiliary_loss_clip": 0.01152624, "auxiliary_loss_mlp": 0.01025076, "balance_loss_clip": 1.04954004, "balance_loss_mlp": 1.01815319, "epoch": 0.5497504960019239, "flos": 24572343536640.0, "grad_norm": 1.5742426786391595, "language_loss": 0.84396642, "learning_rate": 1.7753522797912044e-06, "loss": 0.8657434, "num_input_tokens_seen": 98616140, "step": 4572, "time_per_iteration": 2.531416177749634 }, { "auxiliary_loss_clip": 0.01150438, "auxiliary_loss_mlp": 0.01028978, "balance_loss_clip": 1.04715669, "balance_loss_mlp": 1.02133048, "epoch": 0.549870738892563, "flos": 15450494912640.0, "grad_norm": 2.2784223848045064, "language_loss": 0.69566965, "learning_rate": 1.7745782548071765e-06, "loss": 0.71746379, "num_input_tokens_seen": 98633035, "step": 4573, "time_per_iteration": 2.509826898574829 }, { "auxiliary_loss_clip": 0.01123955, "auxiliary_loss_mlp": 0.01029468, "balance_loss_clip": 1.04771638, "balance_loss_mlp": 1.02252364, "epoch": 0.549990981783202, "flos": 21069114082560.0, "grad_norm": 1.6586114935805851, "language_loss": 0.74221957, "learning_rate": 1.7738042640194482e-06, "loss": 0.76375383, "num_input_tokens_seen": 98652700, "step": 4574, "time_per_iteration": 2.5439114570617676 }, { "auxiliary_loss_clip": 0.01170217, "auxiliary_loss_mlp": 0.0102612, "balance_loss_clip": 1.04890943, "balance_loss_mlp": 1.01859164, "epoch": 0.5501112246738411, "flos": 21395901041280.0, "grad_norm": 1.5356503032671067, "language_loss": 0.70789921, "learning_rate": 1.7730303075454335e-06, "loss": 0.72986257, "num_input_tokens_seen": 98671590, "step": 4575, "time_per_iteration": 2.4466545581817627 }, { "auxiliary_loss_clip": 0.01133062, "auxiliary_loss_mlp": 0.01037204, "balance_loss_clip": 1.04504657, "balance_loss_mlp": 1.02987289, "epoch": 0.5502314675644803, "flos": 17456931699840.0, "grad_norm": 2.562781383827615, "language_loss": 0.84948564, "learning_rate": 1.7722563855025402e-06, "loss": 0.87118828, "num_input_tokens_seen": 98689620, "step": 4576, "time_per_iteration": 2.5518813133239746 }, { "auxiliary_loss_clip": 0.01142781, "auxiliary_loss_mlp": 0.01026494, "balance_loss_clip": 1.04436898, "balance_loss_mlp": 1.01886439, "epoch": 0.5503517104551193, "flos": 24310410583680.0, "grad_norm": 2.1242126012427676, "language_loss": 0.70565581, "learning_rate": 1.7714824980081721e-06, "loss": 0.72734857, "num_input_tokens_seen": 98708915, "step": 4577, "time_per_iteration": 2.577956438064575 }, { "auxiliary_loss_clip": 0.01156753, "auxiliary_loss_mlp": 0.01025, "balance_loss_clip": 1.0519948, "balance_loss_mlp": 1.01735926, "epoch": 0.5504719533457584, "flos": 22419427086720.0, "grad_norm": 1.7776206477358734, "language_loss": 0.73763883, "learning_rate": 1.7707086451797276e-06, "loss": 0.7594564, "num_input_tokens_seen": 98729790, "step": 4578, "time_per_iteration": 2.5243818759918213 }, { "auxiliary_loss_clip": 0.01035108, "auxiliary_loss_mlp": 0.0100029, "balance_loss_clip": 1.01422596, "balance_loss_mlp": 0.99912137, "epoch": 0.5505921962363975, "flos": 67294155968640.0, "grad_norm": 0.6977149925371015, "language_loss": 0.52379906, "learning_rate": 1.7699348271345993e-06, "loss": 0.54415298, "num_input_tokens_seen": 98792415, "step": 4579, "time_per_iteration": 3.0923850536346436 }, { "auxiliary_loss_clip": 0.01030531, "auxiliary_loss_mlp": 0.01000115, "balance_loss_clip": 1.01635742, "balance_loss_mlp": 0.99888676, "epoch": 0.5507124391270366, "flos": 45685125578880.0, "grad_norm": 0.7126373152363171, "language_loss": 0.54478097, "learning_rate": 1.7691610439901753e-06, "loss": 0.56508744, "num_input_tokens_seen": 98855350, "step": 4580, "time_per_iteration": 3.262882947921753 }, { "auxiliary_loss_clip": 0.01160494, "auxiliary_loss_mlp": 0.01033492, "balance_loss_clip": 1.05000532, "balance_loss_mlp": 1.02638984, "epoch": 0.5508326820176757, "flos": 22273845264000.0, "grad_norm": 1.7906486332283178, "language_loss": 0.75606775, "learning_rate": 1.7683872958638367e-06, "loss": 0.77800763, "num_input_tokens_seen": 98874230, "step": 4581, "time_per_iteration": 3.25884747505188 }, { "auxiliary_loss_clip": 0.01140895, "auxiliary_loss_mlp": 0.01028802, "balance_loss_clip": 1.04528356, "balance_loss_mlp": 1.02155149, "epoch": 0.5509529249083148, "flos": 20012442762240.0, "grad_norm": 2.4949820609558615, "language_loss": 0.83936059, "learning_rate": 1.7676135828729614e-06, "loss": 0.86105764, "num_input_tokens_seen": 98893940, "step": 4582, "time_per_iteration": 2.5592641830444336 }, { "auxiliary_loss_clip": 0.01157778, "auxiliary_loss_mlp": 0.01027056, "balance_loss_clip": 1.05166364, "balance_loss_mlp": 1.01915836, "epoch": 0.5510731677989539, "flos": 21834801325440.0, "grad_norm": 4.714340527888407, "language_loss": 0.82974112, "learning_rate": 1.7668399051349205e-06, "loss": 0.8515895, "num_input_tokens_seen": 98913620, "step": 4583, "time_per_iteration": 2.5226328372955322 }, { "auxiliary_loss_clip": 0.01125208, "auxiliary_loss_mlp": 0.01027993, "balance_loss_clip": 1.04520416, "balance_loss_mlp": 1.02044725, "epoch": 0.5511934106895929, "flos": 21467901853440.0, "grad_norm": 1.8477823110538816, "language_loss": 0.83591437, "learning_rate": 1.766066262767081e-06, "loss": 0.85744643, "num_input_tokens_seen": 98931460, "step": 4584, "time_per_iteration": 2.574920177459717 }, { "auxiliary_loss_clip": 0.01140331, "auxiliary_loss_mlp": 0.01025159, "balance_loss_clip": 1.0485034, "balance_loss_mlp": 1.01798558, "epoch": 0.5513136535802321, "flos": 21068934514560.0, "grad_norm": 2.270652175854672, "language_loss": 0.76980036, "learning_rate": 1.765292655886803e-06, "loss": 0.79145527, "num_input_tokens_seen": 98950105, "step": 4585, "time_per_iteration": 2.5350422859191895 }, { "auxiliary_loss_clip": 0.01137724, "auxiliary_loss_mlp": 0.01029298, "balance_loss_clip": 1.04688191, "balance_loss_mlp": 1.02231586, "epoch": 0.5514338964708712, "flos": 27815004754560.0, "grad_norm": 1.7868467903114176, "language_loss": 0.70571232, "learning_rate": 1.764519084611443e-06, "loss": 0.7273826, "num_input_tokens_seen": 98970560, "step": 4586, "time_per_iteration": 3.395937919616699 }, { "auxiliary_loss_clip": 0.01141787, "auxiliary_loss_mlp": 0.01027977, "balance_loss_clip": 1.04618323, "balance_loss_mlp": 1.02019858, "epoch": 0.5515541393615102, "flos": 21908525990400.0, "grad_norm": 1.919060028342727, "language_loss": 0.77785206, "learning_rate": 1.7637455490583505e-06, "loss": 0.7995497, "num_input_tokens_seen": 98989885, "step": 4587, "time_per_iteration": 2.5202691555023193 }, { "auxiliary_loss_clip": 0.01160296, "auxiliary_loss_mlp": 0.01027172, "balance_loss_clip": 1.05123818, "balance_loss_mlp": 1.02049959, "epoch": 0.5516743822521494, "flos": 20485422074880.0, "grad_norm": 1.9198999756093793, "language_loss": 0.77259994, "learning_rate": 1.7629720493448701e-06, "loss": 0.79447466, "num_input_tokens_seen": 99007180, "step": 4588, "time_per_iteration": 3.2994871139526367 }, { "auxiliary_loss_clip": 0.01149638, "auxiliary_loss_mlp": 0.0102815, "balance_loss_clip": 1.04642558, "balance_loss_mlp": 1.02020431, "epoch": 0.5517946251427884, "flos": 14940383915520.0, "grad_norm": 1.755884411483945, "language_loss": 0.84661424, "learning_rate": 1.7621985855883418e-06, "loss": 0.86839211, "num_input_tokens_seen": 99023880, "step": 4589, "time_per_iteration": 3.2289061546325684 }, { "auxiliary_loss_clip": 0.01137643, "auxiliary_loss_mlp": 0.01025895, "balance_loss_clip": 1.04694676, "balance_loss_mlp": 1.01895738, "epoch": 0.5519148680334275, "flos": 18404865573120.0, "grad_norm": 1.7308637367114907, "language_loss": 0.72175002, "learning_rate": 1.7614251579060983e-06, "loss": 0.74338543, "num_input_tokens_seen": 99042475, "step": 4590, "time_per_iteration": 2.514972686767578 }, { "auxiliary_loss_clip": 0.01130647, "auxiliary_loss_mlp": 0.01027326, "balance_loss_clip": 1.04660034, "balance_loss_mlp": 1.01999509, "epoch": 0.5520351109240667, "flos": 25113337251840.0, "grad_norm": 1.7653783526061013, "language_loss": 0.84728837, "learning_rate": 1.76065176641547e-06, "loss": 0.86886811, "num_input_tokens_seen": 99065185, "step": 4591, "time_per_iteration": 2.635100841522217 }, { "auxiliary_loss_clip": 0.01156072, "auxiliary_loss_mlp": 0.01025392, "balance_loss_clip": 1.04614949, "balance_loss_mlp": 1.01855254, "epoch": 0.5521553538147057, "flos": 21069545045760.0, "grad_norm": 1.784774042962693, "language_loss": 0.77672464, "learning_rate": 1.759878411233777e-06, "loss": 0.79853928, "num_input_tokens_seen": 99083645, "step": 4592, "time_per_iteration": 2.5189290046691895 }, { "auxiliary_loss_clip": 0.01158806, "auxiliary_loss_mlp": 0.01024598, "balance_loss_clip": 1.05024934, "balance_loss_mlp": 1.0173173, "epoch": 0.5522755967053448, "flos": 18879999701760.0, "grad_norm": 2.06873187758352, "language_loss": 0.76119184, "learning_rate": 1.7591050924783388e-06, "loss": 0.78302592, "num_input_tokens_seen": 99100835, "step": 4593, "time_per_iteration": 2.4737982749938965 }, { "auxiliary_loss_clip": 0.01021753, "auxiliary_loss_mlp": 0.01005109, "balance_loss_clip": 1.01356292, "balance_loss_mlp": 1.00386906, "epoch": 0.5523958395959839, "flos": 64675622494080.0, "grad_norm": 0.8365527394717246, "language_loss": 0.57946771, "learning_rate": 1.7583318102664661e-06, "loss": 0.59973633, "num_input_tokens_seen": 99168400, "step": 4594, "time_per_iteration": 3.2207822799682617 }, { "auxiliary_loss_clip": 0.01157113, "auxiliary_loss_mlp": 0.01025458, "balance_loss_clip": 1.04565787, "balance_loss_mlp": 1.01822519, "epoch": 0.552516082486623, "flos": 10889732211840.0, "grad_norm": 1.845825808185984, "language_loss": 0.79114807, "learning_rate": 1.757558564715466e-06, "loss": 0.81297374, "num_input_tokens_seen": 99186475, "step": 4595, "time_per_iteration": 2.4805736541748047 }, { "auxiliary_loss_clip": 0.01156231, "auxiliary_loss_mlp": 0.01037088, "balance_loss_clip": 1.04692173, "balance_loss_mlp": 1.0293808, "epoch": 0.552636325377262, "flos": 22199797376640.0, "grad_norm": 2.8190629638858096, "language_loss": 0.74365175, "learning_rate": 1.7567853559426386e-06, "loss": 0.76558495, "num_input_tokens_seen": 99203525, "step": 4596, "time_per_iteration": 2.492171287536621 }, { "auxiliary_loss_clip": 0.01159664, "auxiliary_loss_mlp": 0.01027908, "balance_loss_clip": 1.04950225, "balance_loss_mlp": 1.02094007, "epoch": 0.5527565682679012, "flos": 23988184652160.0, "grad_norm": 2.8282993587365755, "language_loss": 0.74925625, "learning_rate": 1.7560121840652797e-06, "loss": 0.77113199, "num_input_tokens_seen": 99222910, "step": 4597, "time_per_iteration": 2.52915620803833 }, { "auxiliary_loss_clip": 0.01122148, "auxiliary_loss_mlp": 0.01031495, "balance_loss_clip": 1.04566026, "balance_loss_mlp": 1.02411628, "epoch": 0.5528768111585403, "flos": 19719267955200.0, "grad_norm": 2.552780706430194, "language_loss": 0.68997312, "learning_rate": 1.7552390492006782e-06, "loss": 0.71150959, "num_input_tokens_seen": 99241230, "step": 4598, "time_per_iteration": 2.588440179824829 }, { "auxiliary_loss_clip": 0.01118237, "auxiliary_loss_mlp": 0.00761326, "balance_loss_clip": 1.04118967, "balance_loss_mlp": 1.00025082, "epoch": 0.5529970540491793, "flos": 26215975002240.0, "grad_norm": 1.9736144122948742, "language_loss": 0.65401798, "learning_rate": 1.7544659514661184e-06, "loss": 0.67281365, "num_input_tokens_seen": 99264320, "step": 4599, "time_per_iteration": 2.6811771392822266 }, { "auxiliary_loss_clip": 0.01138025, "auxiliary_loss_mlp": 0.01027386, "balance_loss_clip": 1.0448494, "balance_loss_mlp": 1.02048695, "epoch": 0.5531172969398185, "flos": 24425971614720.0, "grad_norm": 2.3820874255264832, "language_loss": 0.79514092, "learning_rate": 1.7536928909788786e-06, "loss": 0.81679505, "num_input_tokens_seen": 99283625, "step": 4600, "time_per_iteration": 2.554486036300659 }, { "auxiliary_loss_clip": 0.01025663, "auxiliary_loss_mlp": 0.01001878, "balance_loss_clip": 1.01687181, "balance_loss_mlp": 1.00070941, "epoch": 0.5532375398304575, "flos": 64907316195840.0, "grad_norm": 0.8806827377861195, "language_loss": 0.61988175, "learning_rate": 1.752919867856231e-06, "loss": 0.6401571, "num_input_tokens_seen": 99335270, "step": 4601, "time_per_iteration": 2.9947681427001953 }, { "auxiliary_loss_clip": 0.01135255, "auxiliary_loss_mlp": 0.01024353, "balance_loss_clip": 1.04454625, "balance_loss_mlp": 1.01702142, "epoch": 0.5533577827210966, "flos": 19683105937920.0, "grad_norm": 1.9156916834007982, "language_loss": 0.78558278, "learning_rate": 1.7521468822154436e-06, "loss": 0.80717885, "num_input_tokens_seen": 99354185, "step": 4602, "time_per_iteration": 2.5461394786834717 }, { "auxiliary_loss_clip": 0.01137619, "auxiliary_loss_mlp": 0.01027113, "balance_loss_clip": 1.04837298, "balance_loss_mlp": 1.02017486, "epoch": 0.5534780256117358, "flos": 32306496076800.0, "grad_norm": 1.7866157117209083, "language_loss": 0.75305021, "learning_rate": 1.751373934173777e-06, "loss": 0.77469754, "num_input_tokens_seen": 99376930, "step": 4603, "time_per_iteration": 2.657331943511963 }, { "auxiliary_loss_clip": 0.01169563, "auxiliary_loss_mlp": 0.0103213, "balance_loss_clip": 1.04825735, "balance_loss_mlp": 1.02441132, "epoch": 0.5535982685023748, "flos": 23222425582080.0, "grad_norm": 1.602646427351878, "language_loss": 0.72952628, "learning_rate": 1.750601023848487e-06, "loss": 0.75154316, "num_input_tokens_seen": 99397655, "step": 4604, "time_per_iteration": 2.5450479984283447 }, { "auxiliary_loss_clip": 0.01169733, "auxiliary_loss_mlp": 0.00760857, "balance_loss_clip": 1.05099392, "balance_loss_mlp": 1.00021529, "epoch": 0.5537185113930139, "flos": 24352534258560.0, "grad_norm": 1.7861673179586828, "language_loss": 0.73968458, "learning_rate": 1.749828151356823e-06, "loss": 0.75899047, "num_input_tokens_seen": 99417850, "step": 4605, "time_per_iteration": 2.486889123916626 }, { "auxiliary_loss_clip": 0.01144102, "auxiliary_loss_mlp": 0.01024012, "balance_loss_clip": 1.04735708, "balance_loss_mlp": 1.01693368, "epoch": 0.553838754283653, "flos": 23549068886400.0, "grad_norm": 2.1854740239586605, "language_loss": 0.75353616, "learning_rate": 1.7490553168160297e-06, "loss": 0.77521729, "num_input_tokens_seen": 99438920, "step": 4606, "time_per_iteration": 3.258605718612671 }, { "auxiliary_loss_clip": 0.01140971, "auxiliary_loss_mlp": 0.01026212, "balance_loss_clip": 1.04708433, "balance_loss_mlp": 1.01924133, "epoch": 0.5539589971742921, "flos": 17275044205440.0, "grad_norm": 1.906224535031859, "language_loss": 0.76243055, "learning_rate": 1.748282520343345e-06, "loss": 0.78410238, "num_input_tokens_seen": 99457950, "step": 4607, "time_per_iteration": 2.523167848587036 }, { "auxiliary_loss_clip": 0.01162751, "auxiliary_loss_mlp": 0.01029503, "balance_loss_clip": 1.04967904, "balance_loss_mlp": 1.02179015, "epoch": 0.5540792400649311, "flos": 27564169104000.0, "grad_norm": 1.9139045386992566, "language_loss": 0.78512293, "learning_rate": 1.7475097620560023e-06, "loss": 0.80704546, "num_input_tokens_seen": 99478015, "step": 4608, "time_per_iteration": 2.544057607650757 }, { "auxiliary_loss_clip": 0.01168974, "auxiliary_loss_mlp": 0.01024899, "balance_loss_clip": 1.04892743, "balance_loss_mlp": 1.01739728, "epoch": 0.5541994829555702, "flos": 23878657105920.0, "grad_norm": 1.766375373592864, "language_loss": 0.70981735, "learning_rate": 1.746737042071228e-06, "loss": 0.73175609, "num_input_tokens_seen": 99496520, "step": 4609, "time_per_iteration": 2.4824166297912598 }, { "auxiliary_loss_clip": 0.01139466, "auxiliary_loss_mlp": 0.01027399, "balance_loss_clip": 1.04861856, "balance_loss_mlp": 1.02054739, "epoch": 0.5543197258462094, "flos": 20115721342080.0, "grad_norm": 1.7818094951726855, "language_loss": 0.78971183, "learning_rate": 1.7459643605062424e-06, "loss": 0.81138045, "num_input_tokens_seen": 99513780, "step": 4610, "time_per_iteration": 2.528555154800415 }, { "auxiliary_loss_clip": 0.01110759, "auxiliary_loss_mlp": 0.0102397, "balance_loss_clip": 1.04478633, "balance_loss_mlp": 1.01626277, "epoch": 0.5544399687368484, "flos": 20916565021440.0, "grad_norm": 2.5639698721382285, "language_loss": 0.80443585, "learning_rate": 1.745191717478262e-06, "loss": 0.82578313, "num_input_tokens_seen": 99532360, "step": 4611, "time_per_iteration": 2.583462715148926 }, { "auxiliary_loss_clip": 0.011365, "auxiliary_loss_mlp": 0.01027582, "balance_loss_clip": 1.04670727, "balance_loss_mlp": 1.02034855, "epoch": 0.5545602116274875, "flos": 25518661297920.0, "grad_norm": 1.6110318002346469, "language_loss": 0.79540324, "learning_rate": 1.7444191131044948e-06, "loss": 0.81704402, "num_input_tokens_seen": 99552635, "step": 4612, "time_per_iteration": 3.362720012664795 }, { "auxiliary_loss_clip": 0.01142719, "auxiliary_loss_mlp": 0.01030521, "balance_loss_clip": 1.04887104, "balance_loss_mlp": 1.02287114, "epoch": 0.5546804545181266, "flos": 20995568985600.0, "grad_norm": 2.0717871138498727, "language_loss": 0.72880232, "learning_rate": 1.7436465475021456e-06, "loss": 0.75053477, "num_input_tokens_seen": 99572685, "step": 4613, "time_per_iteration": 2.533323287963867 }, { "auxiliary_loss_clip": 0.01121071, "auxiliary_loss_mlp": 0.01025022, "balance_loss_clip": 1.04528475, "balance_loss_mlp": 1.01785493, "epoch": 0.5548006974087657, "flos": 26833638297600.0, "grad_norm": 1.8864294852591454, "language_loss": 0.71657252, "learning_rate": 1.7428740207884111e-06, "loss": 0.73803353, "num_input_tokens_seen": 99593565, "step": 4614, "time_per_iteration": 3.3799476623535156 }, { "auxiliary_loss_clip": 0.01113399, "auxiliary_loss_mlp": 0.01028249, "balance_loss_clip": 1.04592609, "balance_loss_mlp": 1.02073312, "epoch": 0.5549209402994048, "flos": 33656414031360.0, "grad_norm": 1.8150752715832288, "language_loss": 0.60730755, "learning_rate": 1.7421015330804833e-06, "loss": 0.6287241, "num_input_tokens_seen": 99613485, "step": 4615, "time_per_iteration": 3.5030436515808105 }, { "auxiliary_loss_clip": 0.011691, "auxiliary_loss_mlp": 0.01023656, "balance_loss_clip": 1.0490458, "balance_loss_mlp": 1.01652694, "epoch": 0.5550411831900439, "flos": 23769524609280.0, "grad_norm": 1.9619490981396115, "language_loss": 0.72246337, "learning_rate": 1.7413290844955475e-06, "loss": 0.7443909, "num_input_tokens_seen": 99633515, "step": 4616, "time_per_iteration": 2.5132598876953125 }, { "auxiliary_loss_clip": 0.01147811, "auxiliary_loss_mlp": 0.01034777, "balance_loss_clip": 1.04841411, "balance_loss_mlp": 1.02743351, "epoch": 0.555161426080683, "flos": 21651189978240.0, "grad_norm": 2.487092648152689, "language_loss": 0.7791363, "learning_rate": 1.7405566751507843e-06, "loss": 0.80096221, "num_input_tokens_seen": 99651560, "step": 4617, "time_per_iteration": 2.5018117427825928 }, { "auxiliary_loss_clip": 0.01126315, "auxiliary_loss_mlp": 0.01021726, "balance_loss_clip": 1.04531455, "balance_loss_mlp": 1.01446605, "epoch": 0.555281668971322, "flos": 49563116605440.0, "grad_norm": 1.9796996779566516, "language_loss": 0.67380661, "learning_rate": 1.7397843051633668e-06, "loss": 0.69528699, "num_input_tokens_seen": 99674255, "step": 4618, "time_per_iteration": 2.8374552726745605 }, { "auxiliary_loss_clip": 0.01151864, "auxiliary_loss_mlp": 0.01029583, "balance_loss_clip": 1.04822016, "balance_loss_mlp": 1.02189994, "epoch": 0.5554019118619612, "flos": 20741608851840.0, "grad_norm": 1.591226271172385, "language_loss": 0.71194839, "learning_rate": 1.739011974650464e-06, "loss": 0.73376286, "num_input_tokens_seen": 99693585, "step": 4619, "time_per_iteration": 2.531893014907837 }, { "auxiliary_loss_clip": 0.01119378, "auxiliary_loss_mlp": 0.01024682, "balance_loss_clip": 1.04598963, "balance_loss_mlp": 1.01673651, "epoch": 0.5555221547526003, "flos": 25483217552640.0, "grad_norm": 2.336783754261743, "language_loss": 0.76673329, "learning_rate": 1.7382396837292365e-06, "loss": 0.78817391, "num_input_tokens_seen": 99714045, "step": 4620, "time_per_iteration": 2.672349691390991 }, { "auxiliary_loss_clip": 0.01171932, "auxiliary_loss_mlp": 0.0103461, "balance_loss_clip": 1.05094671, "balance_loss_mlp": 1.02641439, "epoch": 0.5556423976432393, "flos": 21762513204480.0, "grad_norm": 1.6931209063671189, "language_loss": 0.73431176, "learning_rate": 1.737467432516841e-06, "loss": 0.75637716, "num_input_tokens_seen": 99734145, "step": 4621, "time_per_iteration": 2.481193780899048 }, { "auxiliary_loss_clip": 0.01141303, "auxiliary_loss_mlp": 0.01027502, "balance_loss_clip": 1.04395831, "balance_loss_mlp": 1.02012885, "epoch": 0.5557626405338785, "flos": 24900171989760.0, "grad_norm": 2.4136016985198823, "language_loss": 0.74212861, "learning_rate": 1.7366952211304274e-06, "loss": 0.76381671, "num_input_tokens_seen": 99751990, "step": 4622, "time_per_iteration": 2.54645037651062 }, { "auxiliary_loss_clip": 0.01134149, "auxiliary_loss_mlp": 0.01035514, "balance_loss_clip": 1.04438722, "balance_loss_mlp": 1.02743745, "epoch": 0.5558828834245175, "flos": 18697501676160.0, "grad_norm": 2.0203340606531617, "language_loss": 0.83093345, "learning_rate": 1.735923049687139e-06, "loss": 0.85263014, "num_input_tokens_seen": 99768565, "step": 4623, "time_per_iteration": 2.492771863937378 }, { "auxiliary_loss_clip": 0.01137106, "auxiliary_loss_mlp": 0.01026561, "balance_loss_clip": 1.04442549, "balance_loss_mlp": 1.01976657, "epoch": 0.5560031263151566, "flos": 27272179445760.0, "grad_norm": 1.5482846079620536, "language_loss": 0.73926288, "learning_rate": 1.7351509183041144e-06, "loss": 0.76089954, "num_input_tokens_seen": 99788895, "step": 4624, "time_per_iteration": 2.728402614593506 }, { "auxiliary_loss_clip": 0.01174557, "auxiliary_loss_mlp": 0.01036143, "balance_loss_clip": 1.05316412, "balance_loss_mlp": 1.02815592, "epoch": 0.5561233692057957, "flos": 23403738458880.0, "grad_norm": 1.726591346670488, "language_loss": 0.71410543, "learning_rate": 1.7343788270984852e-06, "loss": 0.73621237, "num_input_tokens_seen": 99808035, "step": 4625, "time_per_iteration": 2.4910695552825928 }, { "auxiliary_loss_clip": 0.0114347, "auxiliary_loss_mlp": 0.01034033, "balance_loss_clip": 1.05069256, "balance_loss_mlp": 1.02619171, "epoch": 0.5562436120964348, "flos": 37670867804160.0, "grad_norm": 1.7955759704205314, "language_loss": 0.74680638, "learning_rate": 1.7336067761873764e-06, "loss": 0.76858133, "num_input_tokens_seen": 99830460, "step": 4626, "time_per_iteration": 2.667536735534668 }, { "auxiliary_loss_clip": 0.01160114, "auxiliary_loss_mlp": 0.01031012, "balance_loss_clip": 1.04764056, "balance_loss_mlp": 1.02331161, "epoch": 0.5563638549870739, "flos": 25155245445120.0, "grad_norm": 2.151416586060699, "language_loss": 0.76120073, "learning_rate": 1.7328347656879076e-06, "loss": 0.78311199, "num_input_tokens_seen": 99850320, "step": 4627, "time_per_iteration": 2.515690565109253 }, { "auxiliary_loss_clip": 0.01126618, "auxiliary_loss_mlp": 0.01027623, "balance_loss_clip": 1.04442513, "balance_loss_mlp": 1.02022576, "epoch": 0.556484097877713, "flos": 13581810783360.0, "grad_norm": 2.2245665836225625, "language_loss": 0.68404913, "learning_rate": 1.7320627957171927e-06, "loss": 0.70559156, "num_input_tokens_seen": 99864980, "step": 4628, "time_per_iteration": 2.5076446533203125 }, { "auxiliary_loss_clip": 0.01171391, "auxiliary_loss_mlp": 0.01025021, "balance_loss_clip": 1.05173492, "balance_loss_mlp": 1.01766586, "epoch": 0.5566043407683521, "flos": 24681368292480.0, "grad_norm": 1.7752222058617542, "language_loss": 0.81730032, "learning_rate": 1.7312908663923382e-06, "loss": 0.83926451, "num_input_tokens_seen": 99881155, "step": 4629, "time_per_iteration": 2.478086233139038 }, { "auxiliary_loss_clip": 0.01151248, "auxiliary_loss_mlp": 0.01024616, "balance_loss_clip": 1.04657269, "balance_loss_mlp": 1.0169208, "epoch": 0.5567245836589911, "flos": 20588161950720.0, "grad_norm": 1.952799282367499, "language_loss": 0.67403734, "learning_rate": 1.7305189778304463e-06, "loss": 0.69579601, "num_input_tokens_seen": 99899330, "step": 4630, "time_per_iteration": 2.4783334732055664 }, { "auxiliary_loss_clip": 0.01147153, "auxiliary_loss_mlp": 0.01026958, "balance_loss_clip": 1.0527972, "balance_loss_mlp": 1.01954913, "epoch": 0.5568448265496303, "flos": 20704189858560.0, "grad_norm": 1.9646916834745636, "language_loss": 0.79886448, "learning_rate": 1.729747130148611e-06, "loss": 0.82060564, "num_input_tokens_seen": 99918525, "step": 4631, "time_per_iteration": 2.507913112640381 }, { "auxiliary_loss_clip": 0.01138027, "auxiliary_loss_mlp": 0.01030449, "balance_loss_clip": 1.04885781, "balance_loss_mlp": 1.02249801, "epoch": 0.5569650694402694, "flos": 25302910256640.0, "grad_norm": 2.660200901257454, "language_loss": 0.77180135, "learning_rate": 1.7289753234639208e-06, "loss": 0.79348612, "num_input_tokens_seen": 99937500, "step": 4632, "time_per_iteration": 3.3080809116363525 }, { "auxiliary_loss_clip": 0.01163526, "auxiliary_loss_mlp": 0.01027466, "balance_loss_clip": 1.05087543, "balance_loss_mlp": 1.01998532, "epoch": 0.5570853123309084, "flos": 19712623939200.0, "grad_norm": 2.0870008309430608, "language_loss": 0.76528943, "learning_rate": 1.7282035578934592e-06, "loss": 0.78719938, "num_input_tokens_seen": 99955665, "step": 4633, "time_per_iteration": 2.475905418395996 }, { "auxiliary_loss_clip": 0.01137961, "auxiliary_loss_mlp": 0.01028912, "balance_loss_clip": 1.05147576, "balance_loss_mlp": 1.02136946, "epoch": 0.5572055552215476, "flos": 16108091153280.0, "grad_norm": 1.6126106501923507, "language_loss": 0.79059958, "learning_rate": 1.727431833554301e-06, "loss": 0.81226838, "num_input_tokens_seen": 99974140, "step": 4634, "time_per_iteration": 2.4992637634277344 }, { "auxiliary_loss_clip": 0.0110609, "auxiliary_loss_mlp": 0.01023592, "balance_loss_clip": 1.04290271, "balance_loss_mlp": 1.01633215, "epoch": 0.5573257981121866, "flos": 17128815937920.0, "grad_norm": 1.8051671649210035, "language_loss": 0.77462441, "learning_rate": 1.7266601505635175e-06, "loss": 0.79592121, "num_input_tokens_seen": 99991480, "step": 4635, "time_per_iteration": 2.5944924354553223 }, { "auxiliary_loss_clip": 0.0115741, "auxiliary_loss_mlp": 0.01029751, "balance_loss_clip": 1.05051625, "balance_loss_mlp": 1.02156091, "epoch": 0.5574460410028257, "flos": 18807029222400.0, "grad_norm": 1.86951939446058, "language_loss": 0.75730181, "learning_rate": 1.7258885090381717e-06, "loss": 0.77917349, "num_input_tokens_seen": 100009520, "step": 4636, "time_per_iteration": 2.5282914638519287 }, { "auxiliary_loss_clip": 0.01145339, "auxiliary_loss_mlp": 0.01029249, "balance_loss_clip": 1.04603732, "balance_loss_mlp": 1.02178311, "epoch": 0.5575662838934649, "flos": 29642678530560.0, "grad_norm": 1.7286884013719412, "language_loss": 0.7826668, "learning_rate": 1.7251169090953213e-06, "loss": 0.80441272, "num_input_tokens_seen": 100029995, "step": 4637, "time_per_iteration": 2.5945987701416016 }, { "auxiliary_loss_clip": 0.01159705, "auxiliary_loss_mlp": 0.01027374, "balance_loss_clip": 1.04970825, "balance_loss_mlp": 1.01973248, "epoch": 0.5576865267841039, "flos": 22054466949120.0, "grad_norm": 2.4298655800550066, "language_loss": 0.76497126, "learning_rate": 1.7243453508520168e-06, "loss": 0.78684205, "num_input_tokens_seen": 100046980, "step": 4638, "time_per_iteration": 3.2878684997558594 }, { "auxiliary_loss_clip": 0.01141295, "auxiliary_loss_mlp": 0.0102628, "balance_loss_clip": 1.04538846, "balance_loss_mlp": 1.01904118, "epoch": 0.557806769674743, "flos": 17196040241280.0, "grad_norm": 2.0102914020847393, "language_loss": 0.84473485, "learning_rate": 1.7235738344253038e-06, "loss": 0.86641061, "num_input_tokens_seen": 100060610, "step": 4639, "time_per_iteration": 2.490902900695801 }, { "auxiliary_loss_clip": 0.01153572, "auxiliary_loss_mlp": 0.01033916, "balance_loss_clip": 1.0488323, "balance_loss_mlp": 1.02585721, "epoch": 0.557927012565382, "flos": 24712717887360.0, "grad_norm": 1.8036252211584771, "language_loss": 0.82696807, "learning_rate": 1.72280235993222e-06, "loss": 0.84884298, "num_input_tokens_seen": 100078915, "step": 4640, "time_per_iteration": 3.307020902633667 }, { "auxiliary_loss_clip": 0.01154933, "auxiliary_loss_mlp": 0.0076236, "balance_loss_clip": 1.0498383, "balance_loss_mlp": 1.00022531, "epoch": 0.5580472554560212, "flos": 16983090460800.0, "grad_norm": 2.012351276166846, "language_loss": 0.69446814, "learning_rate": 1.722030927489798e-06, "loss": 0.71364105, "num_input_tokens_seen": 100096195, "step": 4641, "time_per_iteration": 3.2306628227233887 }, { "auxiliary_loss_clip": 0.01133076, "auxiliary_loss_mlp": 0.01027385, "balance_loss_clip": 1.04918349, "balance_loss_mlp": 1.0192194, "epoch": 0.5581674983466602, "flos": 23509100027520.0, "grad_norm": 1.7136689339715356, "language_loss": 0.74278772, "learning_rate": 1.7212595372150634e-06, "loss": 0.76439238, "num_input_tokens_seen": 100116175, "step": 4642, "time_per_iteration": 2.5891661643981934 }, { "auxiliary_loss_clip": 0.01171721, "auxiliary_loss_mlp": 0.01028491, "balance_loss_clip": 1.05138588, "balance_loss_mlp": 1.02092671, "epoch": 0.5582877412372993, "flos": 13480291969920.0, "grad_norm": 2.5235358787036697, "language_loss": 0.72913927, "learning_rate": 1.720488189225035e-06, "loss": 0.75114143, "num_input_tokens_seen": 100133875, "step": 4643, "time_per_iteration": 2.439209461212158 }, { "auxiliary_loss_clip": 0.01161729, "auxiliary_loss_mlp": 0.01027017, "balance_loss_clip": 1.05036283, "balance_loss_mlp": 1.01920843, "epoch": 0.5584079841279385, "flos": 21903605827200.0, "grad_norm": 3.011949043129492, "language_loss": 0.79173553, "learning_rate": 1.7197168836367265e-06, "loss": 0.81362301, "num_input_tokens_seen": 100150685, "step": 4644, "time_per_iteration": 2.476140022277832 }, { "auxiliary_loss_clip": 0.01156935, "auxiliary_loss_mlp": 0.00761311, "balance_loss_clip": 1.04950452, "balance_loss_mlp": 1.00020027, "epoch": 0.5585282270185775, "flos": 18843550375680.0, "grad_norm": 1.890025443949501, "language_loss": 0.81859171, "learning_rate": 1.7189456205671433e-06, "loss": 0.83777416, "num_input_tokens_seen": 100169530, "step": 4645, "time_per_iteration": 2.4948794841766357 }, { "auxiliary_loss_clip": 0.01162683, "auxiliary_loss_mlp": 0.01027867, "balance_loss_clip": 1.05040622, "balance_loss_mlp": 1.02051759, "epoch": 0.5586484699092166, "flos": 21868449390720.0, "grad_norm": 1.9528033457532326, "language_loss": 0.82643896, "learning_rate": 1.7181744001332866e-06, "loss": 0.84834445, "num_input_tokens_seen": 100188140, "step": 4646, "time_per_iteration": 2.480687141418457 }, { "auxiliary_loss_clip": 0.01170579, "auxiliary_loss_mlp": 0.01030355, "balance_loss_clip": 1.0527662, "balance_loss_mlp": 1.02317238, "epoch": 0.5587687127998557, "flos": 22893232412160.0, "grad_norm": 2.100016970917802, "language_loss": 0.63202649, "learning_rate": 1.7174032224521493e-06, "loss": 0.65403581, "num_input_tokens_seen": 100206850, "step": 4647, "time_per_iteration": 2.4768612384796143 }, { "auxiliary_loss_clip": 0.01157009, "auxiliary_loss_mlp": 0.01025549, "balance_loss_clip": 1.04927123, "balance_loss_mlp": 1.01818824, "epoch": 0.5588889556904948, "flos": 20303067703680.0, "grad_norm": 1.6372271747489948, "language_loss": 0.69833201, "learning_rate": 1.7166320876407184e-06, "loss": 0.72015762, "num_input_tokens_seen": 100226270, "step": 4648, "time_per_iteration": 2.4930996894836426 }, { "auxiliary_loss_clip": 0.01173969, "auxiliary_loss_mlp": 0.00761918, "balance_loss_clip": 1.05153513, "balance_loss_mlp": 1.00022447, "epoch": 0.5590091985811338, "flos": 16472153450880.0, "grad_norm": 1.9496353978249683, "language_loss": 0.67954135, "learning_rate": 1.7158609958159742e-06, "loss": 0.69890022, "num_input_tokens_seen": 100243675, "step": 4649, "time_per_iteration": 2.442575693130493 }, { "auxiliary_loss_clip": 0.01106812, "auxiliary_loss_mlp": 0.0103034, "balance_loss_clip": 1.04320431, "balance_loss_mlp": 1.02283001, "epoch": 0.559129441471773, "flos": 14532186781440.0, "grad_norm": 37.40346569455095, "language_loss": 0.78086346, "learning_rate": 1.7150899470948911e-06, "loss": 0.80223501, "num_input_tokens_seen": 100258940, "step": 4650, "time_per_iteration": 2.5857198238372803 }, { "auxiliary_loss_clip": 0.01043183, "auxiliary_loss_mlp": 0.01003124, "balance_loss_clip": 1.01499832, "balance_loss_mlp": 1.00198579, "epoch": 0.5592496843624121, "flos": 60521009852160.0, "grad_norm": 0.8264820647344485, "language_loss": 0.56631988, "learning_rate": 1.7143189415944365e-06, "loss": 0.58678299, "num_input_tokens_seen": 100323400, "step": 4651, "time_per_iteration": 3.1711440086364746 }, { "auxiliary_loss_clip": 0.01159748, "auxiliary_loss_mlp": 0.01029028, "balance_loss_clip": 1.05241942, "balance_loss_mlp": 1.0213449, "epoch": 0.5593699272530511, "flos": 20886256920960.0, "grad_norm": 1.580686038357926, "language_loss": 0.76416612, "learning_rate": 1.7135479794315714e-06, "loss": 0.78605384, "num_input_tokens_seen": 100340355, "step": 4652, "time_per_iteration": 2.4965269565582275 }, { "auxiliary_loss_clip": 0.01128198, "auxiliary_loss_mlp": 0.01028664, "balance_loss_clip": 1.04696703, "balance_loss_mlp": 1.02116561, "epoch": 0.5594901701436903, "flos": 12896743616640.0, "grad_norm": 1.879331303285908, "language_loss": 0.79193711, "learning_rate": 1.7127770607232502e-06, "loss": 0.81350571, "num_input_tokens_seen": 100358900, "step": 4653, "time_per_iteration": 2.5659799575805664 }, { "auxiliary_loss_clip": 0.01135956, "auxiliary_loss_mlp": 0.01028448, "balance_loss_clip": 1.04701543, "balance_loss_mlp": 1.02122068, "epoch": 0.5596104130343293, "flos": 23112107936640.0, "grad_norm": 1.7552983604592889, "language_loss": 0.79951197, "learning_rate": 1.7120061855864204e-06, "loss": 0.82115602, "num_input_tokens_seen": 100378910, "step": 4654, "time_per_iteration": 2.577087163925171 }, { "auxiliary_loss_clip": 0.01159114, "auxiliary_loss_mlp": 0.0102742, "balance_loss_clip": 1.05184495, "balance_loss_mlp": 1.01936781, "epoch": 0.5597306559249684, "flos": 25957812977280.0, "grad_norm": 17.660739840368322, "language_loss": 0.70764959, "learning_rate": 1.7112353541380233e-06, "loss": 0.72951496, "num_input_tokens_seen": 100398770, "step": 4655, "time_per_iteration": 2.552063465118408 }, { "auxiliary_loss_clip": 0.01146349, "auxiliary_loss_mlp": 0.01037349, "balance_loss_clip": 1.05060923, "balance_loss_mlp": 1.02933216, "epoch": 0.5598508988156076, "flos": 22492289825280.0, "grad_norm": 1.404532588466725, "language_loss": 0.71928096, "learning_rate": 1.7104645664949931e-06, "loss": 0.74111801, "num_input_tokens_seen": 100421240, "step": 4656, "time_per_iteration": 2.577639102935791 }, { "auxiliary_loss_clip": 0.0114464, "auxiliary_loss_mlp": 0.01027891, "balance_loss_clip": 1.04690409, "balance_loss_mlp": 1.02058315, "epoch": 0.5599711417062466, "flos": 23112538899840.0, "grad_norm": 2.752699181863947, "language_loss": 0.71507049, "learning_rate": 1.7096938227742584e-06, "loss": 0.73679578, "num_input_tokens_seen": 100442370, "step": 4657, "time_per_iteration": 2.543750762939453 }, { "auxiliary_loss_clip": 0.01170865, "auxiliary_loss_mlp": 0.01027258, "balance_loss_clip": 1.05113864, "balance_loss_mlp": 1.01953018, "epoch": 0.5600913845968857, "flos": 22339345714560.0, "grad_norm": 2.193619622718537, "language_loss": 0.83700728, "learning_rate": 1.70892312309274e-06, "loss": 0.85898852, "num_input_tokens_seen": 100460260, "step": 4658, "time_per_iteration": 3.306978464126587 }, { "auxiliary_loss_clip": 0.01140039, "auxiliary_loss_mlp": 0.01025892, "balance_loss_clip": 1.04160523, "balance_loss_mlp": 1.01808417, "epoch": 0.5602116274875248, "flos": 17633791290240.0, "grad_norm": 2.142144520186431, "language_loss": 0.67997527, "learning_rate": 1.7081524675673523e-06, "loss": 0.70163465, "num_input_tokens_seen": 100475750, "step": 4659, "time_per_iteration": 2.504667282104492 }, { "auxiliary_loss_clip": 0.0104361, "auxiliary_loss_mlp": 0.01003097, "balance_loss_clip": 1.01433587, "balance_loss_mlp": 1.00202394, "epoch": 0.5603318703781639, "flos": 70115945529600.0, "grad_norm": 0.7695833117575557, "language_loss": 0.5958541, "learning_rate": 1.7073818563150026e-06, "loss": 0.61632115, "num_input_tokens_seen": 100537830, "step": 4660, "time_per_iteration": 3.2082507610321045 }, { "auxiliary_loss_clip": 0.0115285, "auxiliary_loss_mlp": 0.01031944, "balance_loss_clip": 1.04777515, "balance_loss_mlp": 1.023844, "epoch": 0.560452113268803, "flos": 18545850455040.0, "grad_norm": 2.059104674068944, "language_loss": 0.86711752, "learning_rate": 1.7066112894525935e-06, "loss": 0.88896549, "num_input_tokens_seen": 100555910, "step": 4661, "time_per_iteration": 2.4934444427490234 }, { "auxiliary_loss_clip": 0.01134284, "auxiliary_loss_mlp": 0.01033932, "balance_loss_clip": 1.04505026, "balance_loss_mlp": 1.02596331, "epoch": 0.5605723561594421, "flos": 25264665250560.0, "grad_norm": 1.5892462704764865, "language_loss": 0.7273941, "learning_rate": 1.7058407670970177e-06, "loss": 0.74907625, "num_input_tokens_seen": 100577385, "step": 4662, "time_per_iteration": 2.5945751667022705 }, { "auxiliary_loss_clip": 0.01159903, "auxiliary_loss_mlp": 0.01032872, "balance_loss_clip": 1.04984999, "balance_loss_mlp": 1.02537966, "epoch": 0.5606925990500812, "flos": 20594949621120.0, "grad_norm": 2.0886543267470707, "language_loss": 0.61353195, "learning_rate": 1.7050702893651643e-06, "loss": 0.63545966, "num_input_tokens_seen": 100596965, "step": 4663, "time_per_iteration": 2.4823267459869385 }, { "auxiliary_loss_clip": 0.01157207, "auxiliary_loss_mlp": 0.01027993, "balance_loss_clip": 1.0493108, "balance_loss_mlp": 1.02001786, "epoch": 0.5608128419407202, "flos": 35006044677120.0, "grad_norm": 2.1151450885914937, "language_loss": 0.75454581, "learning_rate": 1.7042998563739134e-06, "loss": 0.77639782, "num_input_tokens_seen": 100615315, "step": 4664, "time_per_iteration": 3.4269039630889893 }, { "auxiliary_loss_clip": 0.01152846, "auxiliary_loss_mlp": 0.01034973, "balance_loss_clip": 1.04965675, "balance_loss_mlp": 1.02622271, "epoch": 0.5609330848313594, "flos": 24639819235200.0, "grad_norm": 2.2004014635911253, "language_loss": 0.71598631, "learning_rate": 1.703529468240139e-06, "loss": 0.73786449, "num_input_tokens_seen": 100634185, "step": 4665, "time_per_iteration": 2.58793044090271 }, { "auxiliary_loss_clip": 0.01138424, "auxiliary_loss_mlp": 0.0102813, "balance_loss_clip": 1.04778934, "balance_loss_mlp": 1.02105474, "epoch": 0.5610533277219985, "flos": 18762894385920.0, "grad_norm": 2.1878507623987016, "language_loss": 0.73893267, "learning_rate": 1.7027591250807088e-06, "loss": 0.76059824, "num_input_tokens_seen": 100651360, "step": 4666, "time_per_iteration": 3.3178677558898926 }, { "auxiliary_loss_clip": 0.01173192, "auxiliary_loss_mlp": 0.0102744, "balance_loss_clip": 1.05208659, "balance_loss_mlp": 1.01908898, "epoch": 0.5611735706126375, "flos": 15012384727680.0, "grad_norm": 6.32096757296884, "language_loss": 0.85064149, "learning_rate": 1.7019888270124825e-06, "loss": 0.87264776, "num_input_tokens_seen": 100668525, "step": 4667, "time_per_iteration": 3.1835391521453857 }, { "auxiliary_loss_clip": 0.01162043, "auxiliary_loss_mlp": 0.01037877, "balance_loss_clip": 1.05240941, "balance_loss_mlp": 1.02993798, "epoch": 0.5612938135032767, "flos": 16468167041280.0, "grad_norm": 1.9448245754696594, "language_loss": 0.82294428, "learning_rate": 1.7012185741523147e-06, "loss": 0.84494352, "num_input_tokens_seen": 100684850, "step": 4668, "time_per_iteration": 2.4556496143341064 }, { "auxiliary_loss_clip": 0.01174698, "auxiliary_loss_mlp": 0.01027596, "balance_loss_clip": 1.05449259, "balance_loss_mlp": 1.01943636, "epoch": 0.5614140563939157, "flos": 25666433850240.0, "grad_norm": 1.925699945872736, "language_loss": 0.62518185, "learning_rate": 1.7004483666170514e-06, "loss": 0.64720476, "num_input_tokens_seen": 100705345, "step": 4669, "time_per_iteration": 2.5214765071868896 }, { "auxiliary_loss_clip": 0.01156078, "auxiliary_loss_mlp": 0.01025466, "balance_loss_clip": 1.04987502, "balance_loss_mlp": 1.01837254, "epoch": 0.5615342992845548, "flos": 24717566223360.0, "grad_norm": 2.369796405635374, "language_loss": 0.80598509, "learning_rate": 1.699678204523533e-06, "loss": 0.82780051, "num_input_tokens_seen": 100725210, "step": 4670, "time_per_iteration": 2.517056465148926 }, { "auxiliary_loss_clip": 0.01150588, "auxiliary_loss_mlp": 0.01033076, "balance_loss_clip": 1.05242991, "balance_loss_mlp": 1.02567315, "epoch": 0.5616545421751938, "flos": 22015934634240.0, "grad_norm": 2.6157707464290207, "language_loss": 0.68464273, "learning_rate": 1.6989080879885918e-06, "loss": 0.70647943, "num_input_tokens_seen": 100743070, "step": 4671, "time_per_iteration": 2.5485873222351074 }, { "auxiliary_loss_clip": 0.0103165, "auxiliary_loss_mlp": 0.01000972, "balance_loss_clip": 1.0126853, "balance_loss_mlp": 0.9997803, "epoch": 0.561774785065833, "flos": 53760358690560.0, "grad_norm": 0.9042835411606975, "language_loss": 0.61103189, "learning_rate": 1.6981380171290544e-06, "loss": 0.63135809, "num_input_tokens_seen": 100804095, "step": 4672, "time_per_iteration": 3.12965989112854 }, { "auxiliary_loss_clip": 0.01133956, "auxiliary_loss_mlp": 0.01032413, "balance_loss_clip": 1.04347491, "balance_loss_mlp": 1.02424669, "epoch": 0.5618950279564721, "flos": 19750007018880.0, "grad_norm": 1.8049872731158227, "language_loss": 0.74028748, "learning_rate": 1.6973679920617396e-06, "loss": 0.76195115, "num_input_tokens_seen": 100821630, "step": 4673, "time_per_iteration": 2.524620771408081 }, { "auxiliary_loss_clip": 0.01147641, "auxiliary_loss_mlp": 0.01025638, "balance_loss_clip": 1.05253696, "balance_loss_mlp": 1.01787126, "epoch": 0.5620152708471111, "flos": 16800592435200.0, "grad_norm": 4.597075982699051, "language_loss": 0.84780884, "learning_rate": 1.6965980129034603e-06, "loss": 0.86954165, "num_input_tokens_seen": 100839015, "step": 4674, "time_per_iteration": 2.5264580249786377 }, { "auxiliary_loss_clip": 0.01144914, "auxiliary_loss_mlp": 0.01026047, "balance_loss_clip": 1.05017066, "balance_loss_mlp": 1.01811361, "epoch": 0.5621355137377503, "flos": 26797799502720.0, "grad_norm": 2.6218738040127363, "language_loss": 0.76538324, "learning_rate": 1.6958280797710209e-06, "loss": 0.7870928, "num_input_tokens_seen": 100860940, "step": 4675, "time_per_iteration": 2.570274591445923 }, { "auxiliary_loss_clip": 0.01044453, "auxiliary_loss_mlp": 0.01002494, "balance_loss_clip": 1.0152514, "balance_loss_mlp": 1.00131404, "epoch": 0.5622557566283893, "flos": 61207046686080.0, "grad_norm": 0.7097308628567911, "language_loss": 0.54808426, "learning_rate": 1.6950581927812198e-06, "loss": 0.56855369, "num_input_tokens_seen": 100920510, "step": 4676, "time_per_iteration": 3.0174875259399414 }, { "auxiliary_loss_clip": 0.01155221, "auxiliary_loss_mlp": 0.01026443, "balance_loss_clip": 1.04787564, "balance_loss_mlp": 1.01925445, "epoch": 0.5623759995190284, "flos": 26468534505600.0, "grad_norm": 2.8683365770226614, "language_loss": 0.791179, "learning_rate": 1.6942883520508486e-06, "loss": 0.81299567, "num_input_tokens_seen": 100939245, "step": 4677, "time_per_iteration": 2.5699169635772705 }, { "auxiliary_loss_clip": 0.01155513, "auxiliary_loss_mlp": 0.01028539, "balance_loss_clip": 1.04934645, "balance_loss_mlp": 1.02168739, "epoch": 0.5624962424096676, "flos": 19390900798080.0, "grad_norm": 1.9714058886543886, "language_loss": 0.77566636, "learning_rate": 1.693518557696691e-06, "loss": 0.79750687, "num_input_tokens_seen": 100958385, "step": 4678, "time_per_iteration": 2.483856201171875 }, { "auxiliary_loss_clip": 0.01151801, "auxiliary_loss_mlp": 0.01023861, "balance_loss_clip": 1.04555488, "balance_loss_mlp": 1.01696146, "epoch": 0.5626164853003066, "flos": 20667345482880.0, "grad_norm": 2.364373708544208, "language_loss": 0.8901974, "learning_rate": 1.6927488098355252e-06, "loss": 0.91195399, "num_input_tokens_seen": 100976015, "step": 4679, "time_per_iteration": 2.474860668182373 }, { "auxiliary_loss_clip": 0.01024063, "auxiliary_loss_mlp": 0.01003328, "balance_loss_clip": 1.01409245, "balance_loss_mlp": 1.00210595, "epoch": 0.5627367281909457, "flos": 62766071665920.0, "grad_norm": 0.9166413937123754, "language_loss": 0.63206744, "learning_rate": 1.6919791085841201e-06, "loss": 0.65234125, "num_input_tokens_seen": 101033425, "step": 4680, "time_per_iteration": 3.1317784786224365 }, { "auxiliary_loss_clip": 0.01150049, "auxiliary_loss_mlp": 0.01027897, "balance_loss_clip": 1.04510307, "balance_loss_mlp": 1.02008843, "epoch": 0.5628569710815848, "flos": 12787144243200.0, "grad_norm": 2.0970192500471576, "language_loss": 0.78833318, "learning_rate": 1.6912094540592396e-06, "loss": 0.8101126, "num_input_tokens_seen": 101048945, "step": 4681, "time_per_iteration": 2.444520950317383 }, { "auxiliary_loss_clip": 0.01157512, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.05010509, "balance_loss_mlp": 1.02222276, "epoch": 0.5629772139722239, "flos": 13762082165760.0, "grad_norm": 2.50336754948972, "language_loss": 0.80706966, "learning_rate": 1.6904398463776393e-06, "loss": 0.82894015, "num_input_tokens_seen": 101062745, "step": 4682, "time_per_iteration": 2.4443960189819336 }, { "auxiliary_loss_clip": 0.01157223, "auxiliary_loss_mlp": 0.0102169, "balance_loss_clip": 1.04754293, "balance_loss_mlp": 1.01454926, "epoch": 0.5630974568628629, "flos": 21467830026240.0, "grad_norm": 1.7976092527850882, "language_loss": 0.72807276, "learning_rate": 1.6896702856560683e-06, "loss": 0.7498619, "num_input_tokens_seen": 101081840, "step": 4683, "time_per_iteration": 2.4879748821258545 }, { "auxiliary_loss_clip": 0.01125995, "auxiliary_loss_mlp": 0.01027316, "balance_loss_clip": 1.04219079, "balance_loss_mlp": 1.01965714, "epoch": 0.5632176997535021, "flos": 14245907385600.0, "grad_norm": 3.232454365918642, "language_loss": 0.69312018, "learning_rate": 1.6889007720112677e-06, "loss": 0.71465331, "num_input_tokens_seen": 101099585, "step": 4684, "time_per_iteration": 3.264641046524048 }, { "auxiliary_loss_clip": 0.01159789, "auxiliary_loss_mlp": 0.01023884, "balance_loss_clip": 1.04926109, "balance_loss_mlp": 1.01674628, "epoch": 0.5633379426441412, "flos": 20812244947200.0, "grad_norm": 1.6374168387131542, "language_loss": 0.7725004, "learning_rate": 1.6881313055599734e-06, "loss": 0.79433709, "num_input_tokens_seen": 101119515, "step": 4685, "time_per_iteration": 2.4857230186462402 }, { "auxiliary_loss_clip": 0.01128424, "auxiliary_loss_mlp": 0.01030252, "balance_loss_clip": 1.04269671, "balance_loss_mlp": 1.02269399, "epoch": 0.5634581855347802, "flos": 22600883617920.0, "grad_norm": 6.0398457423833465, "language_loss": 0.82155532, "learning_rate": 1.6873618864189117e-06, "loss": 0.84314209, "num_input_tokens_seen": 101135285, "step": 4686, "time_per_iteration": 2.50394606590271 }, { "auxiliary_loss_clip": 0.01157614, "auxiliary_loss_mlp": 0.01031941, "balance_loss_clip": 1.04932213, "balance_loss_mlp": 1.02451384, "epoch": 0.5635784284254194, "flos": 21506972872320.0, "grad_norm": 2.123408979510677, "language_loss": 0.78083766, "learning_rate": 1.686592514704803e-06, "loss": 0.80273318, "num_input_tokens_seen": 101152680, "step": 4687, "time_per_iteration": 2.488701343536377 }, { "auxiliary_loss_clip": 0.0114174, "auxiliary_loss_mlp": 0.0102681, "balance_loss_clip": 1.04765558, "balance_loss_mlp": 1.01969874, "epoch": 0.5636986713160584, "flos": 19827466698240.0, "grad_norm": 2.17357243903105, "language_loss": 0.70959914, "learning_rate": 1.685823190534361e-06, "loss": 0.73128468, "num_input_tokens_seen": 101170920, "step": 4688, "time_per_iteration": 2.5023059844970703 }, { "auxiliary_loss_clip": 0.01174388, "auxiliary_loss_mlp": 0.01033331, "balance_loss_clip": 1.05077767, "balance_loss_mlp": 1.02483082, "epoch": 0.5638189142066975, "flos": 19792453916160.0, "grad_norm": 1.745904522486701, "language_loss": 0.83401626, "learning_rate": 1.6850539140242907e-06, "loss": 0.85609341, "num_input_tokens_seen": 101190180, "step": 4689, "time_per_iteration": 2.4580090045928955 }, { "auxiliary_loss_clip": 0.01159014, "auxiliary_loss_mlp": 0.01024694, "balance_loss_clip": 1.04966068, "balance_loss_mlp": 1.01746416, "epoch": 0.5639391570973367, "flos": 22893771116160.0, "grad_norm": 1.7265505873506168, "language_loss": 0.81714904, "learning_rate": 1.684284685291292e-06, "loss": 0.83898616, "num_input_tokens_seen": 101211825, "step": 4690, "time_per_iteration": 3.2913572788238525 }, { "auxiliary_loss_clip": 0.01171071, "auxiliary_loss_mlp": 0.01029563, "balance_loss_clip": 1.05064762, "balance_loss_mlp": 1.02165985, "epoch": 0.5640593999879757, "flos": 23727077712000.0, "grad_norm": 2.2072397370150703, "language_loss": 0.81001627, "learning_rate": 1.683515504452055e-06, "loss": 0.83202261, "num_input_tokens_seen": 101229200, "step": 4691, "time_per_iteration": 2.479348659515381 }, { "auxiliary_loss_clip": 0.01120492, "auxiliary_loss_mlp": 0.0102963, "balance_loss_clip": 1.04355288, "balance_loss_mlp": 1.02139294, "epoch": 0.5641796428786148, "flos": 22710123855360.0, "grad_norm": 1.6090901078187176, "language_loss": 0.66300619, "learning_rate": 1.6827463716232648e-06, "loss": 0.68450743, "num_input_tokens_seen": 101249860, "step": 4692, "time_per_iteration": 3.3811419010162354 }, { "auxiliary_loss_clip": 0.01157977, "auxiliary_loss_mlp": 0.00761356, "balance_loss_clip": 1.04914451, "balance_loss_mlp": 1.00019836, "epoch": 0.5642998857692539, "flos": 19791987039360.0, "grad_norm": 1.7674524489475714, "language_loss": 0.75636649, "learning_rate": 1.6819772869215972e-06, "loss": 0.77555978, "num_input_tokens_seen": 101268940, "step": 4693, "time_per_iteration": 3.241947650909424 }, { "auxiliary_loss_clip": 0.01148239, "auxiliary_loss_mlp": 0.01026747, "balance_loss_clip": 1.04876781, "balance_loss_mlp": 1.02001429, "epoch": 0.564420128659893, "flos": 23185904428800.0, "grad_norm": 2.3893878759584766, "language_loss": 0.8230468, "learning_rate": 1.6812082504637228e-06, "loss": 0.84479666, "num_input_tokens_seen": 101290260, "step": 4694, "time_per_iteration": 2.5318498611450195 }, { "auxiliary_loss_clip": 0.01158178, "auxiliary_loss_mlp": 0.01029446, "balance_loss_clip": 1.05071712, "balance_loss_mlp": 1.022192, "epoch": 0.564540371550532, "flos": 23258264376960.0, "grad_norm": 1.4424786694870926, "language_loss": 0.74329853, "learning_rate": 1.6804392623663025e-06, "loss": 0.76517475, "num_input_tokens_seen": 101311465, "step": 4695, "time_per_iteration": 2.566643238067627 }, { "auxiliary_loss_clip": 0.01154172, "auxiliary_loss_mlp": 0.01027242, "balance_loss_clip": 1.04903102, "balance_loss_mlp": 1.02013755, "epoch": 0.5646606144411712, "flos": 25010058672000.0, "grad_norm": 1.7244124622414447, "language_loss": 0.78325689, "learning_rate": 1.6796703227459935e-06, "loss": 0.805071, "num_input_tokens_seen": 101329420, "step": 4696, "time_per_iteration": 2.52933669090271 }, { "auxiliary_loss_clip": 0.01106543, "auxiliary_loss_mlp": 0.01023202, "balance_loss_clip": 1.04043758, "balance_loss_mlp": 1.01587319, "epoch": 0.5647808573318103, "flos": 36539645806080.0, "grad_norm": 1.8135515673988654, "language_loss": 0.76151657, "learning_rate": 1.6789014317194407e-06, "loss": 0.78281403, "num_input_tokens_seen": 101350900, "step": 4697, "time_per_iteration": 2.7067673206329346 }, { "auxiliary_loss_clip": 0.01148271, "auxiliary_loss_mlp": 0.01026599, "balance_loss_clip": 1.04999113, "balance_loss_mlp": 1.01883268, "epoch": 0.5649011002224493, "flos": 22528451842560.0, "grad_norm": 2.066946898103636, "language_loss": 0.73208487, "learning_rate": 1.6781325894032853e-06, "loss": 0.75383353, "num_input_tokens_seen": 101369860, "step": 4698, "time_per_iteration": 2.5440471172332764 }, { "auxiliary_loss_clip": 0.01141142, "auxiliary_loss_mlp": 0.01028384, "balance_loss_clip": 1.04975247, "balance_loss_mlp": 1.02054, "epoch": 0.5650213431130885, "flos": 18515147304960.0, "grad_norm": 2.025519306826211, "language_loss": 0.91689956, "learning_rate": 1.6773637959141608e-06, "loss": 0.93859482, "num_input_tokens_seen": 101386835, "step": 4699, "time_per_iteration": 2.4736459255218506 }, { "auxiliary_loss_clip": 0.01134847, "auxiliary_loss_mlp": 0.01028046, "balance_loss_clip": 1.04553103, "balance_loss_mlp": 1.02065825, "epoch": 0.5651415860037275, "flos": 17526310819200.0, "grad_norm": 2.249312101779733, "language_loss": 0.66484773, "learning_rate": 1.6765950513686915e-06, "loss": 0.68647665, "num_input_tokens_seen": 101404945, "step": 4700, "time_per_iteration": 2.498988389968872 }, { "auxiliary_loss_clip": 0.0111461, "auxiliary_loss_mlp": 0.01029783, "balance_loss_clip": 1.0423075, "balance_loss_mlp": 1.0219276, "epoch": 0.5652618288943666, "flos": 25520026014720.0, "grad_norm": 1.655723079455022, "language_loss": 0.76202077, "learning_rate": 1.675826355883496e-06, "loss": 0.78346467, "num_input_tokens_seen": 101424160, "step": 4701, "time_per_iteration": 2.6138060092926025 }, { "auxiliary_loss_clip": 0.01141891, "auxiliary_loss_mlp": 0.01027356, "balance_loss_clip": 1.05114532, "balance_loss_mlp": 1.01929498, "epoch": 0.5653820717850057, "flos": 19683105937920.0, "grad_norm": 1.954112751204864, "language_loss": 0.79088557, "learning_rate": 1.6750577095751848e-06, "loss": 0.81257814, "num_input_tokens_seen": 101443270, "step": 4702, "time_per_iteration": 2.5127274990081787 }, { "auxiliary_loss_clip": 0.01170763, "auxiliary_loss_mlp": 0.01025843, "balance_loss_clip": 1.05073249, "balance_loss_mlp": 1.01848221, "epoch": 0.5655023146756448, "flos": 26979722910720.0, "grad_norm": 1.620227068573036, "language_loss": 0.72638381, "learning_rate": 1.6742891125603605e-06, "loss": 0.74834985, "num_input_tokens_seen": 101464175, "step": 4703, "time_per_iteration": 2.4916887283325195 }, { "auxiliary_loss_clip": 0.01158318, "auxiliary_loss_mlp": 0.01034976, "balance_loss_clip": 1.05114758, "balance_loss_mlp": 1.02666068, "epoch": 0.5656225575662839, "flos": 27669351104640.0, "grad_norm": 4.593682490544995, "language_loss": 0.71616763, "learning_rate": 1.6735205649556185e-06, "loss": 0.73810059, "num_input_tokens_seen": 101484045, "step": 4704, "time_per_iteration": 2.533649206161499 }, { "auxiliary_loss_clip": 0.01133596, "auxiliary_loss_mlp": 0.01032616, "balance_loss_clip": 1.0465107, "balance_loss_mlp": 1.02480805, "epoch": 0.5657428004569229, "flos": 24349732997760.0, "grad_norm": 1.644535686397196, "language_loss": 0.84582996, "learning_rate": 1.6727520668775476e-06, "loss": 0.86749202, "num_input_tokens_seen": 101504330, "step": 4705, "time_per_iteration": 2.571937084197998 }, { "auxiliary_loss_clip": 0.01172297, "auxiliary_loss_mlp": 0.01024949, "balance_loss_clip": 1.04983711, "balance_loss_mlp": 1.01759994, "epoch": 0.5658630433475621, "flos": 21944041562880.0, "grad_norm": 1.546911801957483, "language_loss": 0.75227195, "learning_rate": 1.6719836184427275e-06, "loss": 0.77424443, "num_input_tokens_seen": 101524635, "step": 4706, "time_per_iteration": 2.4802730083465576 }, { "auxiliary_loss_clip": 0.01141365, "auxiliary_loss_mlp": 0.01025931, "balance_loss_clip": 1.04625916, "balance_loss_mlp": 1.018296, "epoch": 0.5659832862382012, "flos": 30409012218240.0, "grad_norm": 1.630792929390597, "language_loss": 0.64228326, "learning_rate": 1.671215219767733e-06, "loss": 0.66395622, "num_input_tokens_seen": 101544095, "step": 4707, "time_per_iteration": 2.573749303817749 }, { "auxiliary_loss_clip": 0.01114366, "auxiliary_loss_mlp": 0.01027437, "balance_loss_clip": 1.0432303, "balance_loss_mlp": 1.02032065, "epoch": 0.5661035291288402, "flos": 13188194570880.0, "grad_norm": 1.9992210324585, "language_loss": 0.76020181, "learning_rate": 1.670446870969127e-06, "loss": 0.78161991, "num_input_tokens_seen": 101561760, "step": 4708, "time_per_iteration": 2.563624858856201 }, { "auxiliary_loss_clip": 0.01144444, "auxiliary_loss_mlp": 0.01031919, "balance_loss_clip": 1.04908228, "balance_loss_mlp": 1.0242722, "epoch": 0.5662237720194794, "flos": 16143032108160.0, "grad_norm": 2.1230560313472338, "language_loss": 0.79611075, "learning_rate": 1.6696785721634685e-06, "loss": 0.81787443, "num_input_tokens_seen": 101576245, "step": 4709, "time_per_iteration": 3.206770896911621 }, { "auxiliary_loss_clip": 0.01159654, "auxiliary_loss_mlp": 0.01027073, "balance_loss_clip": 1.0497359, "balance_loss_mlp": 1.01961601, "epoch": 0.5663440149101184, "flos": 17676848718720.0, "grad_norm": 1.811617341418638, "language_loss": 0.73536932, "learning_rate": 1.6689103234673086e-06, "loss": 0.7572366, "num_input_tokens_seen": 101594565, "step": 4710, "time_per_iteration": 2.4857959747314453 }, { "auxiliary_loss_clip": 0.01143083, "auxiliary_loss_mlp": 0.01026517, "balance_loss_clip": 1.05028582, "balance_loss_mlp": 1.01885736, "epoch": 0.5664642578007575, "flos": 23368330627200.0, "grad_norm": 1.7944437781282772, "language_loss": 0.76717889, "learning_rate": 1.668142124997189e-06, "loss": 0.78887486, "num_input_tokens_seen": 101614225, "step": 4711, "time_per_iteration": 2.558715343475342 }, { "auxiliary_loss_clip": 0.01034967, "auxiliary_loss_mlp": 0.0100403, "balance_loss_clip": 1.01378369, "balance_loss_mlp": 1.00290382, "epoch": 0.5665845006913967, "flos": 65516470945920.0, "grad_norm": 0.7239260722621347, "language_loss": 0.59804755, "learning_rate": 1.6673739768696453e-06, "loss": 0.61843753, "num_input_tokens_seen": 101680795, "step": 4712, "time_per_iteration": 3.140430450439453 }, { "auxiliary_loss_clip": 0.0114787, "auxiliary_loss_mlp": 0.01036949, "balance_loss_clip": 1.04599798, "balance_loss_mlp": 1.02926016, "epoch": 0.5667047435820357, "flos": 26140885620480.0, "grad_norm": 1.8939920823505374, "language_loss": 0.77390194, "learning_rate": 1.6666058792012052e-06, "loss": 0.79575014, "num_input_tokens_seen": 101701680, "step": 4713, "time_per_iteration": 2.5649683475494385 }, { "auxiliary_loss_clip": 0.01053424, "auxiliary_loss_mlp": 0.0100361, "balance_loss_clip": 1.01324797, "balance_loss_mlp": 1.00252569, "epoch": 0.5668249864726748, "flos": 71866949725440.0, "grad_norm": 0.875434634574743, "language_loss": 0.68785757, "learning_rate": 1.6658378321083878e-06, "loss": 0.70842791, "num_input_tokens_seen": 101766010, "step": 4714, "time_per_iteration": 3.1468007564544678 }, { "auxiliary_loss_clip": 0.0110213, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.04425716, "balance_loss_mlp": 1.02826464, "epoch": 0.5669452293633139, "flos": 22195667312640.0, "grad_norm": 1.713422510168844, "language_loss": 0.8225348, "learning_rate": 1.6650698357077055e-06, "loss": 0.84390938, "num_input_tokens_seen": 101783055, "step": 4715, "time_per_iteration": 3.3826870918273926 }, { "auxiliary_loss_clip": 0.01143608, "auxiliary_loss_mlp": 0.01028195, "balance_loss_clip": 1.04383314, "balance_loss_mlp": 1.02036858, "epoch": 0.567065472253953, "flos": 18223193560320.0, "grad_norm": 2.949168219420587, "language_loss": 0.80940127, "learning_rate": 1.6643018901156632e-06, "loss": 0.8311193, "num_input_tokens_seen": 101802150, "step": 4716, "time_per_iteration": 2.4970686435699463 }, { "auxiliary_loss_clip": 0.01144954, "auxiliary_loss_mlp": 0.01026346, "balance_loss_clip": 1.04538059, "balance_loss_mlp": 1.01909804, "epoch": 0.567185715144592, "flos": 20371548983040.0, "grad_norm": 2.2987399059017104, "language_loss": 0.79304308, "learning_rate": 1.6635339954487566e-06, "loss": 0.81475604, "num_input_tokens_seen": 101818025, "step": 4717, "time_per_iteration": 2.509399175643921 }, { "auxiliary_loss_clip": 0.01146553, "auxiliary_loss_mlp": 0.01027388, "balance_loss_clip": 1.04700041, "balance_loss_mlp": 1.01988387, "epoch": 0.5673059580352312, "flos": 23221348174080.0, "grad_norm": 1.815391248719157, "language_loss": 0.81782675, "learning_rate": 1.6627661518234765e-06, "loss": 0.83956623, "num_input_tokens_seen": 101837280, "step": 4718, "time_per_iteration": 3.334368944168091 }, { "auxiliary_loss_clip": 0.01118393, "auxiliary_loss_mlp": 0.01025842, "balance_loss_clip": 1.0461247, "balance_loss_mlp": 1.01849902, "epoch": 0.5674262009258703, "flos": 21719599430400.0, "grad_norm": 1.594474268232112, "language_loss": 0.85534954, "learning_rate": 1.661998359356302e-06, "loss": 0.87679183, "num_input_tokens_seen": 101856310, "step": 4719, "time_per_iteration": 3.3555374145507812 }, { "auxiliary_loss_clip": 0.01062342, "auxiliary_loss_mlp": 0.01002761, "balance_loss_clip": 1.01360273, "balance_loss_mlp": 1.00169981, "epoch": 0.5675464438165093, "flos": 67470369114240.0, "grad_norm": 0.7474482527793633, "language_loss": 0.55889952, "learning_rate": 1.6612306181637077e-06, "loss": 0.5795505, "num_input_tokens_seen": 101915635, "step": 4720, "time_per_iteration": 3.0358452796936035 }, { "auxiliary_loss_clip": 0.01128805, "auxiliary_loss_mlp": 0.01030949, "balance_loss_clip": 1.04492402, "balance_loss_mlp": 1.02355814, "epoch": 0.5676666867071485, "flos": 18879173688960.0, "grad_norm": 1.9787976402038354, "language_loss": 0.65674096, "learning_rate": 1.6604629283621598e-06, "loss": 0.67833853, "num_input_tokens_seen": 101933565, "step": 4721, "time_per_iteration": 2.5398221015930176 }, { "auxiliary_loss_clip": 0.01173405, "auxiliary_loss_mlp": 0.01030746, "balance_loss_clip": 1.04989457, "balance_loss_mlp": 1.02286077, "epoch": 0.5677869295977875, "flos": 33546778744320.0, "grad_norm": 1.8461646883160006, "language_loss": 0.74044049, "learning_rate": 1.6596952900681152e-06, "loss": 0.76248205, "num_input_tokens_seen": 101954325, "step": 4722, "time_per_iteration": 2.565617322921753 }, { "auxiliary_loss_clip": 0.01104007, "auxiliary_loss_mlp": 0.01030994, "balance_loss_clip": 1.04480565, "balance_loss_mlp": 1.02266121, "epoch": 0.5679071724884266, "flos": 28037256157440.0, "grad_norm": 2.135650447826687, "language_loss": 0.8220762, "learning_rate": 1.658927703398025e-06, "loss": 0.84342623, "num_input_tokens_seen": 101974390, "step": 4723, "time_per_iteration": 2.6109657287597656 }, { "auxiliary_loss_clip": 0.01116641, "auxiliary_loss_mlp": 0.01023953, "balance_loss_clip": 1.0407021, "balance_loss_mlp": 1.01636267, "epoch": 0.5680274153790658, "flos": 23550110380800.0, "grad_norm": 2.2225134011602443, "language_loss": 0.77526283, "learning_rate": 1.6581601684683309e-06, "loss": 0.79666883, "num_input_tokens_seen": 101994815, "step": 4724, "time_per_iteration": 2.619114637374878 }, { "auxiliary_loss_clip": 0.01160505, "auxiliary_loss_mlp": 0.01028972, "balance_loss_clip": 1.05162573, "balance_loss_mlp": 1.02097249, "epoch": 0.5681476582697048, "flos": 22455158140800.0, "grad_norm": 2.2800728144588525, "language_loss": 0.69007611, "learning_rate": 1.6573926853954674e-06, "loss": 0.71197087, "num_input_tokens_seen": 102012400, "step": 4725, "time_per_iteration": 2.49540638923645 }, { "auxiliary_loss_clip": 0.01138347, "auxiliary_loss_mlp": 0.01029286, "balance_loss_clip": 1.04508293, "balance_loss_mlp": 1.02208018, "epoch": 0.5682679011603439, "flos": 19536913584000.0, "grad_norm": 1.850848854559763, "language_loss": 0.83151168, "learning_rate": 1.6566252542958608e-06, "loss": 0.85318804, "num_input_tokens_seen": 102031900, "step": 4726, "time_per_iteration": 2.5470423698425293 }, { "auxiliary_loss_clip": 0.01125061, "auxiliary_loss_mlp": 0.01033273, "balance_loss_clip": 1.04676056, "balance_loss_mlp": 1.02551234, "epoch": 0.568388144050983, "flos": 28765488493440.0, "grad_norm": 2.063937981027075, "language_loss": 0.78575444, "learning_rate": 1.6558578752859305e-06, "loss": 0.8073377, "num_input_tokens_seen": 102050860, "step": 4727, "time_per_iteration": 2.613264799118042 }, { "auxiliary_loss_clip": 0.01127821, "auxiliary_loss_mlp": 0.01024756, "balance_loss_clip": 1.04459763, "balance_loss_mlp": 1.01716876, "epoch": 0.5685083869416221, "flos": 21209452519680.0, "grad_norm": 2.0132146713521912, "language_loss": 0.78609711, "learning_rate": 1.6550905484820865e-06, "loss": 0.80762285, "num_input_tokens_seen": 102069320, "step": 4728, "time_per_iteration": 2.5841875076293945 }, { "auxiliary_loss_clip": 0.011704, "auxiliary_loss_mlp": 0.01029724, "balance_loss_clip": 1.04892254, "balance_loss_mlp": 1.02149272, "epoch": 0.5686286298322611, "flos": 24827021942400.0, "grad_norm": 2.31013763200404, "language_loss": 0.78846002, "learning_rate": 1.6543232740007328e-06, "loss": 0.81046128, "num_input_tokens_seen": 102086435, "step": 4729, "time_per_iteration": 2.4803056716918945 }, { "auxiliary_loss_clip": 0.01159068, "auxiliary_loss_mlp": 0.0102581, "balance_loss_clip": 1.04888821, "balance_loss_mlp": 1.0180881, "epoch": 0.5687488727229003, "flos": 26615121909120.0, "grad_norm": 2.141097078508931, "language_loss": 0.66942346, "learning_rate": 1.653556051958263e-06, "loss": 0.69127226, "num_input_tokens_seen": 102106115, "step": 4730, "time_per_iteration": 2.5520994663238525 }, { "auxiliary_loss_clip": 0.01084782, "auxiliary_loss_mlp": 0.01026626, "balance_loss_clip": 1.04078555, "balance_loss_mlp": 1.01953578, "epoch": 0.5688691156135394, "flos": 20808725414400.0, "grad_norm": 1.8917559384178562, "language_loss": 0.73782313, "learning_rate": 1.6527888824710642e-06, "loss": 0.75893724, "num_input_tokens_seen": 102125715, "step": 4731, "time_per_iteration": 2.64235258102417 }, { "auxiliary_loss_clip": 0.01122456, "auxiliary_loss_mlp": 0.01028205, "balance_loss_clip": 1.04186225, "balance_loss_mlp": 1.0203073, "epoch": 0.5689893585041784, "flos": 25880963829120.0, "grad_norm": 1.9935340461320874, "language_loss": 0.76443493, "learning_rate": 1.6520217656555166e-06, "loss": 0.78594154, "num_input_tokens_seen": 102145005, "step": 4732, "time_per_iteration": 2.600533962249756 }, { "auxiliary_loss_clip": 0.01131997, "auxiliary_loss_mlp": 0.01029244, "balance_loss_clip": 1.04349184, "balance_loss_mlp": 1.02184987, "epoch": 0.5691096013948175, "flos": 23477463123840.0, "grad_norm": 1.5751842152210997, "language_loss": 0.70677924, "learning_rate": 1.65125470162799e-06, "loss": 0.72839165, "num_input_tokens_seen": 102165360, "step": 4733, "time_per_iteration": 2.5750174522399902 }, { "auxiliary_loss_clip": 0.01130985, "auxiliary_loss_mlp": 0.01029351, "balance_loss_clip": 1.04427886, "balance_loss_mlp": 1.02093482, "epoch": 0.5692298442854566, "flos": 18075600576000.0, "grad_norm": 2.2551627821073725, "language_loss": 0.70095229, "learning_rate": 1.6504876905048485e-06, "loss": 0.72255564, "num_input_tokens_seen": 102182320, "step": 4734, "time_per_iteration": 2.531982421875 }, { "auxiliary_loss_clip": 0.01171416, "auxiliary_loss_mlp": 0.01029621, "balance_loss_clip": 1.0508697, "balance_loss_mlp": 1.02267122, "epoch": 0.5693500871760957, "flos": 23039317025280.0, "grad_norm": 1.5176183120504734, "language_loss": 0.72159135, "learning_rate": 1.6497207324024464e-06, "loss": 0.74360168, "num_input_tokens_seen": 102201220, "step": 4735, "time_per_iteration": 2.465909481048584 }, { "auxiliary_loss_clip": 0.01148003, "auxiliary_loss_mlp": 0.01026684, "balance_loss_clip": 1.04666221, "balance_loss_mlp": 1.01883125, "epoch": 0.5694703300667348, "flos": 18989670902400.0, "grad_norm": 1.8092032623703358, "language_loss": 0.82208848, "learning_rate": 1.6489538274371305e-06, "loss": 0.84383529, "num_input_tokens_seen": 102219825, "step": 4736, "time_per_iteration": 3.329573392868042 }, { "auxiliary_loss_clip": 0.0115354, "auxiliary_loss_mlp": 0.01025248, "balance_loss_clip": 1.05006933, "balance_loss_mlp": 1.0181905, "epoch": 0.5695905729573739, "flos": 21908705558400.0, "grad_norm": 1.9254229382147208, "language_loss": 0.83206773, "learning_rate": 1.6481869757252396e-06, "loss": 0.85385561, "num_input_tokens_seen": 102238160, "step": 4737, "time_per_iteration": 2.4985244274139404 }, { "auxiliary_loss_clip": 0.01158208, "auxiliary_loss_mlp": 0.01024939, "balance_loss_clip": 1.04970181, "balance_loss_mlp": 1.0174706, "epoch": 0.569710815848013, "flos": 28476659232000.0, "grad_norm": 1.6906625988672503, "language_loss": 0.71727645, "learning_rate": 1.647420177383105e-06, "loss": 0.73910797, "num_input_tokens_seen": 102261030, "step": 4738, "time_per_iteration": 2.573963165283203 }, { "auxiliary_loss_clip": 0.01156697, "auxiliary_loss_mlp": 0.01031471, "balance_loss_clip": 1.05189919, "balance_loss_mlp": 1.02422297, "epoch": 0.569831058738652, "flos": 28366162018560.0, "grad_norm": 2.448566560510273, "language_loss": 0.72625095, "learning_rate": 1.646653432527049e-06, "loss": 0.74813259, "num_input_tokens_seen": 102281670, "step": 4739, "time_per_iteration": 2.561300754547119 }, { "auxiliary_loss_clip": 0.01134758, "auxiliary_loss_mlp": 0.01026975, "balance_loss_clip": 1.04905844, "balance_loss_mlp": 1.01972091, "epoch": 0.5699513016292912, "flos": 25849973370240.0, "grad_norm": 1.4404881304372572, "language_loss": 0.74263299, "learning_rate": 1.645886741273387e-06, "loss": 0.76425028, "num_input_tokens_seen": 102303485, "step": 4740, "time_per_iteration": 2.6033706665039062 }, { "auxiliary_loss_clip": 0.01124602, "auxiliary_loss_mlp": 0.01027407, "balance_loss_clip": 1.04941273, "balance_loss_mlp": 1.01967013, "epoch": 0.5700715445199303, "flos": 18037858360320.0, "grad_norm": 2.0936335757412086, "language_loss": 0.74031961, "learning_rate": 1.645120103738424e-06, "loss": 0.76183963, "num_input_tokens_seen": 102320995, "step": 4741, "time_per_iteration": 3.3269333839416504 }, { "auxiliary_loss_clip": 0.01148442, "auxiliary_loss_mlp": 0.0076103, "balance_loss_clip": 1.04849577, "balance_loss_mlp": 1.00021005, "epoch": 0.5701917874105693, "flos": 11473352392320.0, "grad_norm": 2.0110076806483197, "language_loss": 0.83219767, "learning_rate": 1.6443535200384591e-06, "loss": 0.85129237, "num_input_tokens_seen": 102339170, "step": 4742, "time_per_iteration": 2.4744019508361816 }, { "auxiliary_loss_clip": 0.01173984, "auxiliary_loss_mlp": 0.01032777, "balance_loss_clip": 1.05339146, "balance_loss_mlp": 1.02592242, "epoch": 0.5703120303012085, "flos": 21761759018880.0, "grad_norm": 1.6039343600038112, "language_loss": 0.70433724, "learning_rate": 1.6435869902897827e-06, "loss": 0.72640479, "num_input_tokens_seen": 102357750, "step": 4743, "time_per_iteration": 2.487847089767456 }, { "auxiliary_loss_clip": 0.0103572, "auxiliary_loss_mlp": 0.01007134, "balance_loss_clip": 1.01628327, "balance_loss_mlp": 1.00598323, "epoch": 0.5704322731918475, "flos": 56746258513920.0, "grad_norm": 0.7918647137310475, "language_loss": 0.62044322, "learning_rate": 1.6428205146086764e-06, "loss": 0.64087176, "num_input_tokens_seen": 102419730, "step": 4744, "time_per_iteration": 3.2042245864868164 }, { "auxiliary_loss_clip": 0.0114968, "auxiliary_loss_mlp": 0.01029254, "balance_loss_clip": 1.0479182, "balance_loss_mlp": 1.02178836, "epoch": 0.5705525160824866, "flos": 20741141975040.0, "grad_norm": 1.5074041409012287, "language_loss": 0.70648509, "learning_rate": 1.6420540931114142e-06, "loss": 0.72827446, "num_input_tokens_seen": 102440320, "step": 4745, "time_per_iteration": 4.0821545124053955 }, { "auxiliary_loss_clip": 0.01148687, "auxiliary_loss_mlp": 0.01033886, "balance_loss_clip": 1.04960227, "balance_loss_mlp": 1.02589941, "epoch": 0.5706727589731257, "flos": 18771262254720.0, "grad_norm": 2.0127515635864546, "language_loss": 0.79113483, "learning_rate": 1.6412877259142616e-06, "loss": 0.81296062, "num_input_tokens_seen": 102460240, "step": 4746, "time_per_iteration": 2.5455076694488525 }, { "auxiliary_loss_clip": 0.01141756, "auxiliary_loss_mlp": 0.01029052, "balance_loss_clip": 1.04783261, "balance_loss_mlp": 1.0211184, "epoch": 0.5707930018637648, "flos": 27634733372160.0, "grad_norm": 2.911127026230653, "language_loss": 0.73569691, "learning_rate": 1.6405214131334757e-06, "loss": 0.75740504, "num_input_tokens_seen": 102478765, "step": 4747, "time_per_iteration": 2.588075637817383 }, { "auxiliary_loss_clip": 0.0111761, "auxiliary_loss_mlp": 0.01028036, "balance_loss_clip": 1.0471909, "balance_loss_mlp": 1.02029657, "epoch": 0.5709132447544039, "flos": 27597673514880.0, "grad_norm": 1.759458085911633, "language_loss": 0.79256952, "learning_rate": 1.6397551548853052e-06, "loss": 0.814026, "num_input_tokens_seen": 102496930, "step": 4748, "time_per_iteration": 2.6519582271575928 }, { "auxiliary_loss_clip": 0.01145668, "auxiliary_loss_mlp": 0.01024858, "balance_loss_clip": 1.05089509, "balance_loss_mlp": 1.01686549, "epoch": 0.571033487645043, "flos": 21686095019520.0, "grad_norm": 1.522793647113733, "language_loss": 0.71272659, "learning_rate": 1.6389889512859917e-06, "loss": 0.7344318, "num_input_tokens_seen": 102516590, "step": 4749, "time_per_iteration": 2.5416009426116943 }, { "auxiliary_loss_clip": 0.01043416, "auxiliary_loss_mlp": 0.01000553, "balance_loss_clip": 1.01502728, "balance_loss_mlp": 0.9994567, "epoch": 0.5711537305356821, "flos": 70181445980160.0, "grad_norm": 0.8118761218628681, "language_loss": 0.60390252, "learning_rate": 1.638222802451767e-06, "loss": 0.62434214, "num_input_tokens_seen": 102578070, "step": 4750, "time_per_iteration": 3.1377086639404297 }, { "auxiliary_loss_clip": 0.0115196, "auxiliary_loss_mlp": 0.0102977, "balance_loss_clip": 1.04867482, "balance_loss_mlp": 1.02245104, "epoch": 0.5712739734263211, "flos": 24717494396160.0, "grad_norm": 1.6649042227474369, "language_loss": 0.7530148, "learning_rate": 1.6374567084988561e-06, "loss": 0.77483213, "num_input_tokens_seen": 102599255, "step": 4751, "time_per_iteration": 2.54168701171875 }, { "auxiliary_loss_clip": 0.01150128, "auxiliary_loss_mlp": 0.0102702, "balance_loss_clip": 1.0527513, "balance_loss_mlp": 1.01900673, "epoch": 0.5713942163169603, "flos": 26578169792640.0, "grad_norm": 1.598190291869178, "language_loss": 0.76777369, "learning_rate": 1.6366906695434738e-06, "loss": 0.78954518, "num_input_tokens_seen": 102621775, "step": 4752, "time_per_iteration": 2.579773187637329 }, { "auxiliary_loss_clip": 0.0116089, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 1.05299926, "balance_loss_mlp": 1.02511501, "epoch": 0.5715144592075994, "flos": 21142443697920.0, "grad_norm": 1.781591038237632, "language_loss": 0.85552299, "learning_rate": 1.6359246857018275e-06, "loss": 0.87745285, "num_input_tokens_seen": 102639305, "step": 4753, "time_per_iteration": 2.4952282905578613 }, { "auxiliary_loss_clip": 0.01111512, "auxiliary_loss_mlp": 0.01025702, "balance_loss_clip": 1.04175806, "balance_loss_mlp": 1.01840043, "epoch": 0.5716347020982384, "flos": 23330265189120.0, "grad_norm": 1.8804661356654193, "language_loss": 0.78113353, "learning_rate": 1.6351587570901178e-06, "loss": 0.80250573, "num_input_tokens_seen": 102659430, "step": 4754, "time_per_iteration": 2.6202831268310547 }, { "auxiliary_loss_clip": 0.0113038, "auxiliary_loss_mlp": 0.01029163, "balance_loss_clip": 1.0492487, "balance_loss_mlp": 1.02137303, "epoch": 0.5717549449888776, "flos": 17009555806080.0, "grad_norm": 2.698012580418275, "language_loss": 0.76012558, "learning_rate": 1.634392883824534e-06, "loss": 0.781721, "num_input_tokens_seen": 102671430, "step": 4755, "time_per_iteration": 2.5178325176239014 }, { "auxiliary_loss_clip": 0.0111582, "auxiliary_loss_mlp": 0.01031547, "balance_loss_clip": 1.04433823, "balance_loss_mlp": 1.02371466, "epoch": 0.5718751878795166, "flos": 35518130922240.0, "grad_norm": 1.8204940130282183, "language_loss": 0.67399329, "learning_rate": 1.6336270660212595e-06, "loss": 0.695467, "num_input_tokens_seen": 102693025, "step": 4756, "time_per_iteration": 2.7197060585021973 }, { "auxiliary_loss_clip": 0.01141803, "auxiliary_loss_mlp": 0.01029469, "balance_loss_clip": 1.05158997, "balance_loss_mlp": 1.02105033, "epoch": 0.5719954307701557, "flos": 38613989255040.0, "grad_norm": 2.056452172171135, "language_loss": 0.65513343, "learning_rate": 1.6328613037964676e-06, "loss": 0.67684615, "num_input_tokens_seen": 102716090, "step": 4757, "time_per_iteration": 2.668653726577759 }, { "auxiliary_loss_clip": 0.01156821, "auxiliary_loss_mlp": 0.01028702, "balance_loss_clip": 1.0486939, "balance_loss_mlp": 1.02082849, "epoch": 0.5721156736607949, "flos": 20631111638400.0, "grad_norm": 1.7234097491464406, "language_loss": 0.67769587, "learning_rate": 1.6320955972663241e-06, "loss": 0.69955111, "num_input_tokens_seen": 102735685, "step": 4758, "time_per_iteration": 2.5086021423339844 }, { "auxiliary_loss_clip": 0.01157148, "auxiliary_loss_mlp": 0.01027811, "balance_loss_clip": 1.04796815, "balance_loss_mlp": 1.02002943, "epoch": 0.5722359165514339, "flos": 37415076076800.0, "grad_norm": 1.5412204856880487, "language_loss": 0.64868259, "learning_rate": 1.6313299465469857e-06, "loss": 0.67053211, "num_input_tokens_seen": 102758415, "step": 4759, "time_per_iteration": 2.6277527809143066 }, { "auxiliary_loss_clip": 0.01156048, "auxiliary_loss_mlp": 0.0103488, "balance_loss_clip": 1.05017495, "balance_loss_mlp": 1.02714288, "epoch": 0.572356159442073, "flos": 21972877205760.0, "grad_norm": 2.686810003290098, "language_loss": 0.7934559, "learning_rate": 1.6305643517546014e-06, "loss": 0.8153652, "num_input_tokens_seen": 102773795, "step": 4760, "time_per_iteration": 2.480984687805176 }, { "auxiliary_loss_clip": 0.01172289, "auxiliary_loss_mlp": 0.01030474, "balance_loss_clip": 1.05287194, "balance_loss_mlp": 1.02345872, "epoch": 0.5724764023327121, "flos": 19135540033920.0, "grad_norm": 1.764084811537368, "language_loss": 0.84211975, "learning_rate": 1.629798813005311e-06, "loss": 0.86414737, "num_input_tokens_seen": 102793515, "step": 4761, "time_per_iteration": 3.2491726875305176 }, { "auxiliary_loss_clip": 0.01119048, "auxiliary_loss_mlp": 0.01023115, "balance_loss_clip": 1.04783571, "balance_loss_mlp": 1.01566744, "epoch": 0.5725966452233512, "flos": 22819759142400.0, "grad_norm": 1.9445446213251805, "language_loss": 0.70825076, "learning_rate": 1.6290333304152473e-06, "loss": 0.72967243, "num_input_tokens_seen": 102813390, "step": 4762, "time_per_iteration": 2.610839366912842 }, { "auxiliary_loss_clip": 0.01140527, "auxiliary_loss_mlp": 0.01030485, "balance_loss_clip": 1.05040097, "balance_loss_mlp": 1.02258182, "epoch": 0.5727168881139902, "flos": 41496610498560.0, "grad_norm": 1.6940130316478839, "language_loss": 0.57042205, "learning_rate": 1.6282679041005314e-06, "loss": 0.59213221, "num_input_tokens_seen": 102838980, "step": 4763, "time_per_iteration": 2.7282886505126953 }, { "auxiliary_loss_clip": 0.01136855, "auxiliary_loss_mlp": 0.01027128, "balance_loss_clip": 1.04588389, "balance_loss_mlp": 1.01975834, "epoch": 0.5728371310046293, "flos": 14647675985280.0, "grad_norm": 3.961083090736645, "language_loss": 0.87047613, "learning_rate": 1.6275025341772789e-06, "loss": 0.89211607, "num_input_tokens_seen": 102855285, "step": 4764, "time_per_iteration": 2.505280017852783 }, { "auxiliary_loss_clip": 0.01143504, "auxiliary_loss_mlp": 0.01029875, "balance_loss_clip": 1.04676807, "balance_loss_mlp": 1.02166152, "epoch": 0.5729573738952685, "flos": 21506613736320.0, "grad_norm": 5.618618334090883, "language_loss": 0.82081956, "learning_rate": 1.626737220761596e-06, "loss": 0.84255338, "num_input_tokens_seen": 102872750, "step": 4765, "time_per_iteration": 2.552645683288574 }, { "auxiliary_loss_clip": 0.01154585, "auxiliary_loss_mlp": 0.01036999, "balance_loss_clip": 1.05006874, "balance_loss_mlp": 1.02928042, "epoch": 0.5730776167859075, "flos": 23621680229760.0, "grad_norm": 1.8654010787597635, "language_loss": 0.7898705, "learning_rate": 1.62597196396958e-06, "loss": 0.81178629, "num_input_tokens_seen": 102890920, "step": 4766, "time_per_iteration": 3.280918836593628 }, { "auxiliary_loss_clip": 0.01156227, "auxiliary_loss_mlp": 0.010289, "balance_loss_clip": 1.04863846, "balance_loss_mlp": 1.02112126, "epoch": 0.5731978596765466, "flos": 25739224761600.0, "grad_norm": 1.8773122771276927, "language_loss": 0.8566817, "learning_rate": 1.6252067639173197e-06, "loss": 0.87853301, "num_input_tokens_seen": 102912830, "step": 4767, "time_per_iteration": 2.546895742416382 }, { "auxiliary_loss_clip": 0.01160066, "auxiliary_loss_mlp": 0.01028992, "balance_loss_clip": 1.04966283, "balance_loss_mlp": 1.0213927, "epoch": 0.5733181025671857, "flos": 26359509749760.0, "grad_norm": 1.7463057671398208, "language_loss": 0.6995312, "learning_rate": 1.6244416207208956e-06, "loss": 0.72142178, "num_input_tokens_seen": 102933765, "step": 4768, "time_per_iteration": 2.5489680767059326 }, { "auxiliary_loss_clip": 0.01129432, "auxiliary_loss_mlp": 0.01026375, "balance_loss_clip": 1.04691076, "balance_loss_mlp": 1.0186559, "epoch": 0.5734383454578248, "flos": 29423874833280.0, "grad_norm": 1.7217540019677244, "language_loss": 0.73395759, "learning_rate": 1.6236765344963787e-06, "loss": 0.75551569, "num_input_tokens_seen": 102955025, "step": 4769, "time_per_iteration": 2.6353330612182617 }, { "auxiliary_loss_clip": 0.01145087, "auxiliary_loss_mlp": 0.01030171, "balance_loss_clip": 1.04959655, "balance_loss_mlp": 1.02301884, "epoch": 0.5735585883484638, "flos": 34969954487040.0, "grad_norm": 2.0155849524802276, "language_loss": 0.69119173, "learning_rate": 1.6229115053598322e-06, "loss": 0.71294433, "num_input_tokens_seen": 102976780, "step": 4770, "time_per_iteration": 2.6697425842285156 }, { "auxiliary_loss_clip": 0.01160832, "auxiliary_loss_mlp": 0.01036931, "balance_loss_clip": 1.05424094, "balance_loss_mlp": 1.02879238, "epoch": 0.573678831239103, "flos": 18770759464320.0, "grad_norm": 1.8125642122205157, "language_loss": 0.72255546, "learning_rate": 1.6221465334273108e-06, "loss": 0.74453306, "num_input_tokens_seen": 102995990, "step": 4771, "time_per_iteration": 3.292959213256836 }, { "auxiliary_loss_clip": 0.01134401, "auxiliary_loss_mlp": 0.0102348, "balance_loss_clip": 1.04693174, "balance_loss_mlp": 1.01647925, "epoch": 0.5737990741297421, "flos": 25702883176320.0, "grad_norm": 2.020893945632337, "language_loss": 0.61735672, "learning_rate": 1.6213816188148593e-06, "loss": 0.63893551, "num_input_tokens_seen": 103014695, "step": 4772, "time_per_iteration": 2.605311155319214 }, { "auxiliary_loss_clip": 0.01133305, "auxiliary_loss_mlp": 0.01029702, "balance_loss_clip": 1.04817891, "balance_loss_mlp": 1.02208745, "epoch": 0.5739193170203811, "flos": 27269234530560.0, "grad_norm": 1.759720230345461, "language_loss": 0.77431649, "learning_rate": 1.6206167616385162e-06, "loss": 0.79594654, "num_input_tokens_seen": 103035760, "step": 4773, "time_per_iteration": 2.5785958766937256 }, { "auxiliary_loss_clip": 0.01151543, "auxiliary_loss_mlp": 0.01028921, "balance_loss_clip": 1.05100727, "balance_loss_mlp": 1.02043962, "epoch": 0.5740395599110203, "flos": 12239721993600.0, "grad_norm": 2.5958423895791336, "language_loss": 0.74023098, "learning_rate": 1.6198519620143078e-06, "loss": 0.76203561, "num_input_tokens_seen": 103052915, "step": 4774, "time_per_iteration": 2.5263259410858154 }, { "auxiliary_loss_clip": 0.01135187, "auxiliary_loss_mlp": 0.01026265, "balance_loss_clip": 1.04890394, "balance_loss_mlp": 1.01860023, "epoch": 0.5741598028016593, "flos": 25921399564800.0, "grad_norm": 1.5435255837358124, "language_loss": 0.7831285, "learning_rate": 1.6190872200582546e-06, "loss": 0.80474305, "num_input_tokens_seen": 103074655, "step": 4775, "time_per_iteration": 2.62835955619812 }, { "auxiliary_loss_clip": 0.01140175, "auxiliary_loss_mlp": 0.00761628, "balance_loss_clip": 1.04755259, "balance_loss_mlp": 1.00020814, "epoch": 0.5742800456922984, "flos": 19244133826560.0, "grad_norm": 2.6474815931208657, "language_loss": 0.7825954, "learning_rate": 1.6183225358863676e-06, "loss": 0.80161345, "num_input_tokens_seen": 103091550, "step": 4776, "time_per_iteration": 2.5224947929382324 }, { "auxiliary_loss_clip": 0.01134854, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 1.04681802, "balance_loss_mlp": 1.01848793, "epoch": 0.5744002885829376, "flos": 30920487932160.0, "grad_norm": 4.105652291156933, "language_loss": 0.71387446, "learning_rate": 1.617557909614648e-06, "loss": 0.7354902, "num_input_tokens_seen": 103110985, "step": 4777, "time_per_iteration": 2.6222987174987793 }, { "auxiliary_loss_clip": 0.01127871, "auxiliary_loss_mlp": 0.01030873, "balance_loss_clip": 1.04680073, "balance_loss_mlp": 1.02333939, "epoch": 0.5745205314735766, "flos": 23840017050240.0, "grad_norm": 1.8215922031128746, "language_loss": 0.86272371, "learning_rate": 1.6167933413590899e-06, "loss": 0.88431108, "num_input_tokens_seen": 103129890, "step": 4778, "time_per_iteration": 2.5919148921966553 }, { "auxiliary_loss_clip": 0.01158553, "auxiliary_loss_mlp": 0.01025437, "balance_loss_clip": 1.05003905, "balance_loss_mlp": 1.01781368, "epoch": 0.5746407743642157, "flos": 12311902373760.0, "grad_norm": 2.269078951126678, "language_loss": 0.90704387, "learning_rate": 1.6160288312356773e-06, "loss": 0.92888373, "num_input_tokens_seen": 103147020, "step": 4779, "time_per_iteration": 2.493025541305542 }, { "auxiliary_loss_clip": 0.01162105, "auxiliary_loss_mlp": 0.01029699, "balance_loss_clip": 1.04978156, "balance_loss_mlp": 1.02209938, "epoch": 0.5747610172548548, "flos": 24133658734080.0, "grad_norm": 1.6131643340121986, "language_loss": 0.81732285, "learning_rate": 1.6152643793603857e-06, "loss": 0.83924091, "num_input_tokens_seen": 103167370, "step": 4780, "time_per_iteration": 2.5295186042785645 }, { "auxiliary_loss_clip": 0.01174452, "auxiliary_loss_mlp": 0.01033965, "balance_loss_clip": 1.0535208, "balance_loss_mlp": 1.02639556, "epoch": 0.5748812601454939, "flos": 25408451393280.0, "grad_norm": 1.6226286000650232, "language_loss": 0.87725425, "learning_rate": 1.6144999858491815e-06, "loss": 0.89933848, "num_input_tokens_seen": 103186000, "step": 4781, "time_per_iteration": 2.503038167953491 }, { "auxiliary_loss_clip": 0.01147616, "auxiliary_loss_mlp": 0.01025771, "balance_loss_clip": 1.04766583, "balance_loss_mlp": 1.01820755, "epoch": 0.575001503036133, "flos": 30624942827520.0, "grad_norm": 1.5980363231149146, "language_loss": 0.8568657, "learning_rate": 1.6137356508180232e-06, "loss": 0.87859964, "num_input_tokens_seen": 103207710, "step": 4782, "time_per_iteration": 2.6149351596832275 }, { "auxiliary_loss_clip": 0.0117066, "auxiliary_loss_mlp": 0.00761925, "balance_loss_clip": 1.04917169, "balance_loss_mlp": 1.0002104, "epoch": 0.5751217459267721, "flos": 21726566668800.0, "grad_norm": 1.7299858508468873, "language_loss": 0.81587791, "learning_rate": 1.6129713743828593e-06, "loss": 0.83520377, "num_input_tokens_seen": 103226720, "step": 4783, "time_per_iteration": 2.485017776489258 }, { "auxiliary_loss_clip": 0.01143442, "auxiliary_loss_mlp": 0.01030248, "balance_loss_clip": 1.04565346, "balance_loss_mlp": 1.02264857, "epoch": 0.5752419888174112, "flos": 21651620941440.0, "grad_norm": 1.4838722564552809, "language_loss": 0.75472128, "learning_rate": 1.6122071566596306e-06, "loss": 0.7764582, "num_input_tokens_seen": 103246995, "step": 4784, "time_per_iteration": 2.5426180362701416 }, { "auxiliary_loss_clip": 0.01160104, "auxiliary_loss_mlp": 0.01027385, "balance_loss_clip": 1.05040884, "balance_loss_mlp": 1.01951718, "epoch": 0.5753622317080502, "flos": 17775997234560.0, "grad_norm": 6.482085405431135, "language_loss": 0.83305538, "learning_rate": 1.6114429977642674e-06, "loss": 0.85493028, "num_input_tokens_seen": 103261500, "step": 4785, "time_per_iteration": 2.456416130065918 }, { "auxiliary_loss_clip": 0.01158002, "auxiliary_loss_mlp": 0.01029385, "balance_loss_clip": 1.05243027, "balance_loss_mlp": 1.02186251, "epoch": 0.5754824745986894, "flos": 19789616741760.0, "grad_norm": 7.606676463526228, "language_loss": 0.73793435, "learning_rate": 1.6106788978126926e-06, "loss": 0.75980824, "num_input_tokens_seen": 103280475, "step": 4786, "time_per_iteration": 2.5046398639678955 }, { "auxiliary_loss_clip": 0.01109222, "auxiliary_loss_mlp": 0.01027306, "balance_loss_clip": 1.04114509, "balance_loss_mlp": 1.01943839, "epoch": 0.5756027174893285, "flos": 30985665160320.0, "grad_norm": 2.445406540617777, "language_loss": 0.78718388, "learning_rate": 1.6099148569208196e-06, "loss": 0.80854917, "num_input_tokens_seen": 103297695, "step": 4787, "time_per_iteration": 3.455181360244751 }, { "auxiliary_loss_clip": 0.0114394, "auxiliary_loss_mlp": 0.0103374, "balance_loss_clip": 1.05181098, "balance_loss_mlp": 1.02570844, "epoch": 0.5757229603799675, "flos": 28546864364160.0, "grad_norm": 2.1091661787231413, "language_loss": 0.62842584, "learning_rate": 1.6091508752045523e-06, "loss": 0.65020263, "num_input_tokens_seen": 103318575, "step": 4788, "time_per_iteration": 2.5940284729003906 }, { "auxiliary_loss_clip": 0.01122593, "auxiliary_loss_mlp": 0.01032837, "balance_loss_clip": 1.04350877, "balance_loss_mlp": 1.02476633, "epoch": 0.5758432032706067, "flos": 22999024944000.0, "grad_norm": 1.5457673126035152, "language_loss": 0.86420023, "learning_rate": 1.608386952779787e-06, "loss": 0.88575453, "num_input_tokens_seen": 103337945, "step": 4789, "time_per_iteration": 2.6013150215148926 }, { "auxiliary_loss_clip": 0.01148094, "auxiliary_loss_mlp": 0.0102869, "balance_loss_clip": 1.04843998, "balance_loss_mlp": 1.0214417, "epoch": 0.5759634461612457, "flos": 25739727552000.0, "grad_norm": 1.5506805590535169, "language_loss": 0.74750948, "learning_rate": 1.6076230897624098e-06, "loss": 0.76927722, "num_input_tokens_seen": 103360150, "step": 4790, "time_per_iteration": 2.6182539463043213 }, { "auxiliary_loss_clip": 0.01158484, "auxiliary_loss_mlp": 0.01030395, "balance_loss_clip": 1.04729056, "balance_loss_mlp": 1.02258086, "epoch": 0.5760836890518848, "flos": 30591761639040.0, "grad_norm": 1.9088375603250827, "language_loss": 0.7758975, "learning_rate": 1.6068592862682974e-06, "loss": 0.79778636, "num_input_tokens_seen": 103378305, "step": 4791, "time_per_iteration": 2.575953960418701 }, { "auxiliary_loss_clip": 0.01148226, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.04884803, "balance_loss_mlp": 1.02446985, "epoch": 0.576203931942524, "flos": 36538963447680.0, "grad_norm": 2.204847724069221, "language_loss": 0.73596662, "learning_rate": 1.6060955424133187e-06, "loss": 0.75776345, "num_input_tokens_seen": 103399230, "step": 4792, "time_per_iteration": 2.6583011150360107 }, { "auxiliary_loss_clip": 0.01158691, "auxiliary_loss_mlp": 0.01027771, "balance_loss_clip": 1.0502764, "balance_loss_mlp": 1.02022171, "epoch": 0.576324174833163, "flos": 25516937445120.0, "grad_norm": 1.6171293810970002, "language_loss": 0.89616859, "learning_rate": 1.6053318583133332e-06, "loss": 0.91803324, "num_input_tokens_seen": 103420100, "step": 4793, "time_per_iteration": 3.374431610107422 }, { "auxiliary_loss_clip": 0.01156572, "auxiliary_loss_mlp": 0.0102472, "balance_loss_clip": 1.04942513, "balance_loss_mlp": 1.01738226, "epoch": 0.5764444177238021, "flos": 25119262995840.0, "grad_norm": 2.322522423522114, "language_loss": 0.75269282, "learning_rate": 1.6045682340841907e-06, "loss": 0.77450567, "num_input_tokens_seen": 103439025, "step": 4794, "time_per_iteration": 2.539273262023926 }, { "auxiliary_loss_clip": 0.01032689, "auxiliary_loss_mlp": 0.00751221, "balance_loss_clip": 1.01359463, "balance_loss_mlp": 1.00021029, "epoch": 0.5765646606144411, "flos": 62212687758720.0, "grad_norm": 0.7508870940778991, "language_loss": 0.57995141, "learning_rate": 1.6038046698417336e-06, "loss": 0.59779054, "num_input_tokens_seen": 103499920, "step": 4795, "time_per_iteration": 3.1307826042175293 }, { "auxiliary_loss_clip": 0.01155554, "auxiliary_loss_mlp": 0.01027849, "balance_loss_clip": 1.04715753, "balance_loss_mlp": 1.01985025, "epoch": 0.5766849035050803, "flos": 25118760205440.0, "grad_norm": 1.8083449014192836, "language_loss": 0.68772602, "learning_rate": 1.6030411657017919e-06, "loss": 0.7095601, "num_input_tokens_seen": 103519575, "step": 4796, "time_per_iteration": 2.5229616165161133 }, { "auxiliary_loss_clip": 0.01148146, "auxiliary_loss_mlp": 0.01031643, "balance_loss_clip": 1.04665375, "balance_loss_mlp": 1.02414465, "epoch": 0.5768051463957193, "flos": 15991093578240.0, "grad_norm": 1.7008754443343417, "language_loss": 0.84494591, "learning_rate": 1.6022777217801903e-06, "loss": 0.86674374, "num_input_tokens_seen": 103536530, "step": 4797, "time_per_iteration": 3.297555685043335 }, { "auxiliary_loss_clip": 0.01130484, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.04904699, "balance_loss_mlp": 1.02223372, "epoch": 0.5769253892863584, "flos": 22163635359360.0, "grad_norm": 1.9085876191566582, "language_loss": 0.7395792, "learning_rate": 1.601514338192742e-06, "loss": 0.7611798, "num_input_tokens_seen": 103556460, "step": 4798, "time_per_iteration": 2.5574562549591064 }, { "auxiliary_loss_clip": 0.01165067, "auxiliary_loss_mlp": 0.01027318, "balance_loss_clip": 1.04760957, "balance_loss_mlp": 1.02040946, "epoch": 0.5770456321769976, "flos": 22856388036480.0, "grad_norm": 2.0559547223549575, "language_loss": 0.71279776, "learning_rate": 1.6007510150552514e-06, "loss": 0.73472154, "num_input_tokens_seen": 103574520, "step": 4799, "time_per_iteration": 2.484485149383545 }, { "auxiliary_loss_clip": 0.0115935, "auxiliary_loss_mlp": 0.01031114, "balance_loss_clip": 1.04719055, "balance_loss_mlp": 1.02308846, "epoch": 0.5771658750676366, "flos": 46353672489600.0, "grad_norm": 1.5152514541315234, "language_loss": 0.61992913, "learning_rate": 1.599987752483515e-06, "loss": 0.64183378, "num_input_tokens_seen": 103598965, "step": 4800, "time_per_iteration": 2.732713460922241 }, { "auxiliary_loss_clip": 0.01123285, "auxiliary_loss_mlp": 0.01031384, "balance_loss_clip": 1.04452109, "balance_loss_mlp": 1.02354646, "epoch": 0.5772861179582757, "flos": 22159972172160.0, "grad_norm": 1.6078098071599773, "language_loss": 0.67803788, "learning_rate": 1.5992245505933184e-06, "loss": 0.6995846, "num_input_tokens_seen": 103618665, "step": 4801, "time_per_iteration": 2.557635545730591 }, { "auxiliary_loss_clip": 0.01172842, "auxiliary_loss_mlp": 0.01029738, "balance_loss_clip": 1.0515095, "balance_loss_mlp": 1.02215004, "epoch": 0.5774063608489148, "flos": 31248926916480.0, "grad_norm": 2.210489589614168, "language_loss": 0.70919347, "learning_rate": 1.5984614095004388e-06, "loss": 0.73121929, "num_input_tokens_seen": 103639800, "step": 4802, "time_per_iteration": 2.5752646923065186 }, { "auxiliary_loss_clip": 0.01150268, "auxiliary_loss_mlp": 0.01025872, "balance_loss_clip": 1.04750919, "balance_loss_mlp": 1.01792026, "epoch": 0.5775266037395539, "flos": 22527123039360.0, "grad_norm": 2.3473591381145793, "language_loss": 0.8086977, "learning_rate": 1.5976983293206438e-06, "loss": 0.83045912, "num_input_tokens_seen": 103655605, "step": 4803, "time_per_iteration": 2.488098382949829 }, { "auxiliary_loss_clip": 0.0113933, "auxiliary_loss_mlp": 0.01025847, "balance_loss_clip": 1.04440355, "balance_loss_mlp": 1.01837826, "epoch": 0.577646846630193, "flos": 21068790860160.0, "grad_norm": 1.9298818234721835, "language_loss": 0.71243626, "learning_rate": 1.5969353101696928e-06, "loss": 0.73408806, "num_input_tokens_seen": 103674045, "step": 4804, "time_per_iteration": 2.5387024879455566 }, { "auxiliary_loss_clip": 0.01156367, "auxiliary_loss_mlp": 0.0103026, "balance_loss_clip": 1.04830194, "balance_loss_mlp": 1.02350962, "epoch": 0.5777670895208321, "flos": 29714284293120.0, "grad_norm": 1.89261819296346, "language_loss": 0.79934031, "learning_rate": 1.5961723521633341e-06, "loss": 0.82120657, "num_input_tokens_seen": 103695285, "step": 4805, "time_per_iteration": 2.57186222076416 }, { "auxiliary_loss_clip": 0.01136441, "auxiliary_loss_mlp": 0.01027689, "balance_loss_clip": 1.04608333, "balance_loss_mlp": 1.02005982, "epoch": 0.5778873324114712, "flos": 19500428344320.0, "grad_norm": 2.3827984269904277, "language_loss": 0.90970594, "learning_rate": 1.5954094554173097e-06, "loss": 0.93134725, "num_input_tokens_seen": 103713275, "step": 4806, "time_per_iteration": 2.5221855640411377 }, { "auxiliary_loss_clip": 0.01148218, "auxiliary_loss_mlp": 0.01033268, "balance_loss_clip": 1.04969299, "balance_loss_mlp": 1.02640474, "epoch": 0.5780075753021102, "flos": 14136846716160.0, "grad_norm": 2.0704251206657345, "language_loss": 0.79254264, "learning_rate": 1.5946466200473482e-06, "loss": 0.81435752, "num_input_tokens_seen": 103731185, "step": 4807, "time_per_iteration": 2.5368432998657227 }, { "auxiliary_loss_clip": 0.01144792, "auxiliary_loss_mlp": 0.01026137, "balance_loss_clip": 1.04562306, "balance_loss_mlp": 1.01945543, "epoch": 0.5781278181927494, "flos": 15262178883840.0, "grad_norm": 1.8272160797844164, "language_loss": 0.83588547, "learning_rate": 1.5938838461691723e-06, "loss": 0.85759479, "num_input_tokens_seen": 103748095, "step": 4808, "time_per_iteration": 2.531243085861206 }, { "auxiliary_loss_clip": 0.01173008, "auxiliary_loss_mlp": 0.01030651, "balance_loss_clip": 1.05305696, "balance_loss_mlp": 1.02258635, "epoch": 0.5782480610833884, "flos": 16726831856640.0, "grad_norm": 2.5545174271581406, "language_loss": 0.83047283, "learning_rate": 1.593121133898494e-06, "loss": 0.85250938, "num_input_tokens_seen": 103765300, "step": 4809, "time_per_iteration": 2.433091878890991 }, { "auxiliary_loss_clip": 0.01165392, "auxiliary_loss_mlp": 0.01029475, "balance_loss_clip": 1.05145574, "balance_loss_mlp": 1.02232242, "epoch": 0.5783683039740275, "flos": 25482140144640.0, "grad_norm": 2.014364029489039, "language_loss": 0.79380512, "learning_rate": 1.592358483351016e-06, "loss": 0.81575382, "num_input_tokens_seen": 103785475, "step": 4810, "time_per_iteration": 2.5464024543762207 }, { "auxiliary_loss_clip": 0.01154574, "auxiliary_loss_mlp": 0.01023304, "balance_loss_clip": 1.04908109, "balance_loss_mlp": 1.01630938, "epoch": 0.5784885468646667, "flos": 18405835240320.0, "grad_norm": 1.8647481767233427, "language_loss": 0.72746277, "learning_rate": 1.5915958946424326e-06, "loss": 0.74924159, "num_input_tokens_seen": 103804160, "step": 4811, "time_per_iteration": 2.468224048614502 }, { "auxiliary_loss_clip": 0.01129735, "auxiliary_loss_mlp": 0.00761382, "balance_loss_clip": 1.04620695, "balance_loss_mlp": 1.0001837, "epoch": 0.5786087897553057, "flos": 46100717936640.0, "grad_norm": 1.9540912341668029, "language_loss": 0.74391639, "learning_rate": 1.5908333678884271e-06, "loss": 0.76282758, "num_input_tokens_seen": 103830580, "step": 4812, "time_per_iteration": 2.806713104248047 }, { "auxiliary_loss_clip": 0.01156702, "auxiliary_loss_mlp": 0.01027954, "balance_loss_clip": 1.05041099, "balance_loss_mlp": 1.02034843, "epoch": 0.5787290326459448, "flos": 12385950261120.0, "grad_norm": 1.9489988764100805, "language_loss": 0.74197894, "learning_rate": 1.5900709032046743e-06, "loss": 0.76382554, "num_input_tokens_seen": 103848655, "step": 4813, "time_per_iteration": 3.2448208332061768 }, { "auxiliary_loss_clip": 0.01143786, "auxiliary_loss_mlp": 0.01022891, "balance_loss_clip": 1.05307364, "balance_loss_mlp": 1.01608109, "epoch": 0.5788492755365839, "flos": 23290332243840.0, "grad_norm": 2.564992613876135, "language_loss": 0.78184879, "learning_rate": 1.5893085007068391e-06, "loss": 0.80351555, "num_input_tokens_seen": 103866215, "step": 4814, "time_per_iteration": 2.542616605758667 }, { "auxiliary_loss_clip": 0.01132306, "auxiliary_loss_mlp": 0.01026974, "balance_loss_clip": 1.04533434, "balance_loss_mlp": 1.01972568, "epoch": 0.578969518427223, "flos": 24061047390720.0, "grad_norm": 1.9074492796256006, "language_loss": 0.70961332, "learning_rate": 1.5885461605105786e-06, "loss": 0.73120612, "num_input_tokens_seen": 103887815, "step": 4815, "time_per_iteration": 2.554541826248169 }, { "auxiliary_loss_clip": 0.01141717, "auxiliary_loss_mlp": 0.01030461, "balance_loss_clip": 1.04683793, "balance_loss_mlp": 1.02363038, "epoch": 0.579089761317862, "flos": 21871825269120.0, "grad_norm": 2.0329790301598085, "language_loss": 0.76677567, "learning_rate": 1.5877838827315375e-06, "loss": 0.78849745, "num_input_tokens_seen": 103906360, "step": 4816, "time_per_iteration": 2.5364270210266113 }, { "auxiliary_loss_clip": 0.01172415, "auxiliary_loss_mlp": 0.0102824, "balance_loss_clip": 1.0522095, "balance_loss_mlp": 1.02042294, "epoch": 0.5792100042085012, "flos": 22929681738240.0, "grad_norm": 1.6853634445819305, "language_loss": 0.69763494, "learning_rate": 1.587021667485355e-06, "loss": 0.71964145, "num_input_tokens_seen": 103925730, "step": 4817, "time_per_iteration": 2.4639716148376465 }, { "auxiliary_loss_clip": 0.01146064, "auxiliary_loss_mlp": 0.01027758, "balance_loss_clip": 1.0469135, "balance_loss_mlp": 1.02052164, "epoch": 0.5793302470991403, "flos": 21470056669440.0, "grad_norm": 1.7439349154987231, "language_loss": 0.78544033, "learning_rate": 1.5862595148876559e-06, "loss": 0.8071785, "num_input_tokens_seen": 103945835, "step": 4818, "time_per_iteration": 2.5266549587249756 }, { "auxiliary_loss_clip": 0.01117661, "auxiliary_loss_mlp": 0.01030268, "balance_loss_clip": 1.04496408, "balance_loss_mlp": 1.02254057, "epoch": 0.5794504899897793, "flos": 12711013367040.0, "grad_norm": 2.023591617017957, "language_loss": 0.76428401, "learning_rate": 1.58549742505406e-06, "loss": 0.78576338, "num_input_tokens_seen": 103960580, "step": 4819, "time_per_iteration": 3.3362550735473633 }, { "auxiliary_loss_clip": 0.01169103, "auxiliary_loss_mlp": 0.01025329, "balance_loss_clip": 1.04916465, "balance_loss_mlp": 1.01790774, "epoch": 0.5795707328804185, "flos": 14867054300160.0, "grad_norm": 2.1719049137597732, "language_loss": 0.75719601, "learning_rate": 1.5847353981001747e-06, "loss": 0.77914035, "num_input_tokens_seen": 103977760, "step": 4820, "time_per_iteration": 2.438805103302002 }, { "auxiliary_loss_clip": 0.01135454, "auxiliary_loss_mlp": 0.01035913, "balance_loss_clip": 1.04419243, "balance_loss_mlp": 1.02860856, "epoch": 0.5796909757710575, "flos": 36430046432640.0, "grad_norm": 1.5077667256079406, "language_loss": 0.69952196, "learning_rate": 1.5839734341415993e-06, "loss": 0.72123563, "num_input_tokens_seen": 103999960, "step": 4821, "time_per_iteration": 2.675907611846924 }, { "auxiliary_loss_clip": 0.01151313, "auxiliary_loss_mlp": 0.01027659, "balance_loss_clip": 1.0524509, "balance_loss_mlp": 1.02097738, "epoch": 0.5798112186616966, "flos": 23039891642880.0, "grad_norm": 2.0868016052317797, "language_loss": 0.76419449, "learning_rate": 1.5832115332939238e-06, "loss": 0.78598416, "num_input_tokens_seen": 104018400, "step": 4822, "time_per_iteration": 2.509261131286621 }, { "auxiliary_loss_clip": 0.01159483, "auxiliary_loss_mlp": 0.01029529, "balance_loss_clip": 1.05097675, "balance_loss_mlp": 1.02208734, "epoch": 0.5799314615523358, "flos": 16652604401280.0, "grad_norm": 1.764672533787325, "language_loss": 0.74410248, "learning_rate": 1.5824496956727272e-06, "loss": 0.76599258, "num_input_tokens_seen": 104035605, "step": 4823, "time_per_iteration": 3.309842109680176 }, { "auxiliary_loss_clip": 0.01140366, "auxiliary_loss_mlp": 0.01023892, "balance_loss_clip": 1.04566479, "balance_loss_mlp": 1.01716018, "epoch": 0.5800517044429748, "flos": 20485673470080.0, "grad_norm": 1.7541143132296766, "language_loss": 0.72825897, "learning_rate": 1.5816879213935797e-06, "loss": 0.74990153, "num_input_tokens_seen": 104054415, "step": 4824, "time_per_iteration": 2.5237081050872803 }, { "auxiliary_loss_clip": 0.01155229, "auxiliary_loss_mlp": 0.01029107, "balance_loss_clip": 1.05122352, "balance_loss_mlp": 1.02204943, "epoch": 0.5801719473336139, "flos": 31538258968320.0, "grad_norm": 1.4357325508714724, "language_loss": 0.79430044, "learning_rate": 1.5809262105720416e-06, "loss": 0.81614375, "num_input_tokens_seen": 104075455, "step": 4825, "time_per_iteration": 2.590930223464966 }, { "auxiliary_loss_clip": 0.01170588, "auxiliary_loss_mlp": 0.0102699, "balance_loss_clip": 1.05077457, "balance_loss_mlp": 1.01974797, "epoch": 0.580292190224253, "flos": 20375966355840.0, "grad_norm": 1.5103004928313333, "language_loss": 0.79248887, "learning_rate": 1.5801645633236644e-06, "loss": 0.81446469, "num_input_tokens_seen": 104096440, "step": 4826, "time_per_iteration": 2.4867396354675293 }, { "auxiliary_loss_clip": 0.01137819, "auxiliary_loss_mlp": 0.01026146, "balance_loss_clip": 1.04640448, "balance_loss_mlp": 1.0185498, "epoch": 0.5804124331148921, "flos": 26615373304320.0, "grad_norm": 1.7971999051742023, "language_loss": 0.77424788, "learning_rate": 1.579402979763989e-06, "loss": 0.79588759, "num_input_tokens_seen": 104116775, "step": 4827, "time_per_iteration": 2.5885190963745117 }, { "auxiliary_loss_clip": 0.01111822, "auxiliary_loss_mlp": 0.01023958, "balance_loss_clip": 1.04587209, "balance_loss_mlp": 1.0168891, "epoch": 0.5805326760055312, "flos": 13478496289920.0, "grad_norm": 1.9931154863154255, "language_loss": 0.81127322, "learning_rate": 1.578641460008548e-06, "loss": 0.83263099, "num_input_tokens_seen": 104134510, "step": 4828, "time_per_iteration": 2.589778184890747 }, { "auxiliary_loss_clip": 0.01155845, "auxiliary_loss_mlp": 0.01022634, "balance_loss_clip": 1.0506146, "balance_loss_mlp": 1.01590729, "epoch": 0.5806529188961702, "flos": 12091374823680.0, "grad_norm": 2.151955386236936, "language_loss": 0.67818069, "learning_rate": 1.5778800041728613e-06, "loss": 0.69996548, "num_input_tokens_seen": 104150800, "step": 4829, "time_per_iteration": 2.475308418273926 }, { "auxiliary_loss_clip": 0.01155533, "auxiliary_loss_mlp": 0.01022477, "balance_loss_clip": 1.05131602, "balance_loss_mlp": 1.01598907, "epoch": 0.5807731617868094, "flos": 26214107495040.0, "grad_norm": 1.4840353178405017, "language_loss": 0.66206712, "learning_rate": 1.577118612372443e-06, "loss": 0.68384719, "num_input_tokens_seen": 104172640, "step": 4830, "time_per_iteration": 2.563689708709717 }, { "auxiliary_loss_clip": 0.01139842, "auxiliary_loss_mlp": 0.00761613, "balance_loss_clip": 1.04609966, "balance_loss_mlp": 1.0001992, "epoch": 0.5808934046774484, "flos": 37962139190400.0, "grad_norm": 1.804626099858776, "language_loss": 0.7054882, "learning_rate": 1.5763572847227943e-06, "loss": 0.7245028, "num_input_tokens_seen": 104193525, "step": 4831, "time_per_iteration": 2.68552303314209 }, { "auxiliary_loss_clip": 0.01153853, "auxiliary_loss_mlp": 0.01021301, "balance_loss_clip": 1.04659534, "balance_loss_mlp": 1.01490188, "epoch": 0.5810136475680875, "flos": 20485853038080.0, "grad_norm": 1.7830408934785524, "language_loss": 0.81329179, "learning_rate": 1.5755960213394091e-06, "loss": 0.83504337, "num_input_tokens_seen": 104210625, "step": 4832, "time_per_iteration": 2.511032819747925 }, { "auxiliary_loss_clip": 0.01129885, "auxiliary_loss_mlp": 0.01026376, "balance_loss_clip": 1.04633427, "balance_loss_mlp": 1.01909828, "epoch": 0.5811338904587267, "flos": 17530153574400.0, "grad_norm": 1.736493882798787, "language_loss": 0.7818318, "learning_rate": 1.5748348223377703e-06, "loss": 0.80339444, "num_input_tokens_seen": 104228180, "step": 4833, "time_per_iteration": 2.550215482711792 }, { "auxiliary_loss_clip": 0.01140542, "auxiliary_loss_mlp": 0.0102773, "balance_loss_clip": 1.04910111, "balance_loss_mlp": 1.02049422, "epoch": 0.5812541333493657, "flos": 19458017360640.0, "grad_norm": 1.6290558694031396, "language_loss": 0.77882922, "learning_rate": 1.5740736878333507e-06, "loss": 0.80051196, "num_input_tokens_seen": 104246020, "step": 4834, "time_per_iteration": 2.517512798309326 }, { "auxiliary_loss_clip": 0.01145884, "auxiliary_loss_mlp": 0.01027227, "balance_loss_clip": 1.04747295, "balance_loss_mlp": 1.02022052, "epoch": 0.5813743762400048, "flos": 20594949621120.0, "grad_norm": 2.6649396195790223, "language_loss": 0.7789712, "learning_rate": 1.5733126179416143e-06, "loss": 0.80070233, "num_input_tokens_seen": 104260505, "step": 4835, "time_per_iteration": 2.5310263633728027 }, { "auxiliary_loss_clip": 0.01156111, "auxiliary_loss_mlp": 0.01029591, "balance_loss_clip": 1.05075371, "balance_loss_mlp": 1.02220941, "epoch": 0.5814946191306439, "flos": 33178227246720.0, "grad_norm": 2.203887264801454, "language_loss": 0.72258753, "learning_rate": 1.5725516127780137e-06, "loss": 0.74444455, "num_input_tokens_seen": 104282640, "step": 4836, "time_per_iteration": 2.614704132080078 }, { "auxiliary_loss_clip": 0.01162972, "auxiliary_loss_mlp": 0.0102463, "balance_loss_clip": 1.0497992, "balance_loss_mlp": 1.01742387, "epoch": 0.581614862021283, "flos": 16143283503360.0, "grad_norm": 2.0437911840252108, "language_loss": 0.88016498, "learning_rate": 1.5717906724579943e-06, "loss": 0.90204096, "num_input_tokens_seen": 104299700, "step": 4837, "time_per_iteration": 2.4780101776123047 }, { "auxiliary_loss_clip": 0.0113424, "auxiliary_loss_mlp": 0.01025279, "balance_loss_clip": 1.04548025, "balance_loss_mlp": 1.01865363, "epoch": 0.581735104911922, "flos": 33802642298880.0, "grad_norm": 2.1670632958790486, "language_loss": 0.68286425, "learning_rate": 1.571029797096989e-06, "loss": 0.70445943, "num_input_tokens_seen": 104320805, "step": 4838, "time_per_iteration": 2.670537233352661 }, { "auxiliary_loss_clip": 0.01168804, "auxiliary_loss_mlp": 0.01023649, "balance_loss_clip": 1.05059958, "balance_loss_mlp": 1.01640701, "epoch": 0.5818553478025612, "flos": 23331163029120.0, "grad_norm": 1.7255514903728824, "language_loss": 0.78619874, "learning_rate": 1.570268986810423e-06, "loss": 0.80812323, "num_input_tokens_seen": 104340700, "step": 4839, "time_per_iteration": 3.247474193572998 }, { "auxiliary_loss_clip": 0.0114179, "auxiliary_loss_mlp": 0.01028851, "balance_loss_clip": 1.04893041, "balance_loss_mlp": 1.02171588, "epoch": 0.5819755906932003, "flos": 20996143603200.0, "grad_norm": 1.8612386448662739, "language_loss": 0.74944997, "learning_rate": 1.5695082417137096e-06, "loss": 0.77115643, "num_input_tokens_seen": 104358575, "step": 4840, "time_per_iteration": 2.5304155349731445 }, { "auxiliary_loss_clip": 0.01139625, "auxiliary_loss_mlp": 0.01026164, "balance_loss_clip": 1.04583526, "balance_loss_mlp": 1.01907134, "epoch": 0.5820958335838393, "flos": 21431668008960.0, "grad_norm": 1.5054334857171527, "language_loss": 0.74980873, "learning_rate": 1.5687475619222539e-06, "loss": 0.77146661, "num_input_tokens_seen": 104378530, "step": 4841, "time_per_iteration": 2.5731375217437744 }, { "auxiliary_loss_clip": 0.01134502, "auxiliary_loss_mlp": 0.01032783, "balance_loss_clip": 1.0439682, "balance_loss_mlp": 1.02535069, "epoch": 0.5822160764744785, "flos": 17967473660160.0, "grad_norm": 2.705172044043733, "language_loss": 0.73709035, "learning_rate": 1.5679869475514496e-06, "loss": 0.75876319, "num_input_tokens_seen": 104395465, "step": 4842, "time_per_iteration": 2.553636074066162 }, { "auxiliary_loss_clip": 0.01160128, "auxiliary_loss_mlp": 0.01028431, "balance_loss_clip": 1.05244398, "balance_loss_mlp": 1.02094173, "epoch": 0.5823363193651175, "flos": 23033858158080.0, "grad_norm": 1.9946328162978828, "language_loss": 0.81211591, "learning_rate": 1.567226398716682e-06, "loss": 0.83400154, "num_input_tokens_seen": 104415380, "step": 4843, "time_per_iteration": 2.5081214904785156 }, { "auxiliary_loss_clip": 0.0114744, "auxiliary_loss_mlp": 0.0102792, "balance_loss_clip": 1.04703403, "balance_loss_mlp": 1.02057028, "epoch": 0.5824565622557566, "flos": 32891840110080.0, "grad_norm": 1.855424445451852, "language_loss": 0.62014365, "learning_rate": 1.566465915533326e-06, "loss": 0.6418972, "num_input_tokens_seen": 104437410, "step": 4844, "time_per_iteration": 2.643021583557129 }, { "auxiliary_loss_clip": 0.01153604, "auxiliary_loss_mlp": 0.0103041, "balance_loss_clip": 1.04874587, "balance_loss_mlp": 1.02296507, "epoch": 0.5825768051463958, "flos": 22229674513920.0, "grad_norm": 1.9382944751951445, "language_loss": 0.88139904, "learning_rate": 1.5657054981167458e-06, "loss": 0.90323919, "num_input_tokens_seen": 104456305, "step": 4845, "time_per_iteration": 3.2869231700897217 }, { "auxiliary_loss_clip": 0.0115611, "auxiliary_loss_mlp": 0.01026086, "balance_loss_clip": 1.05037045, "balance_loss_mlp": 1.01892161, "epoch": 0.5826970480370348, "flos": 28001561016960.0, "grad_norm": 2.0815856332815392, "language_loss": 0.67584372, "learning_rate": 1.5649451465822965e-06, "loss": 0.69766569, "num_input_tokens_seen": 104477695, "step": 4846, "time_per_iteration": 2.611398220062256 }, { "auxiliary_loss_clip": 0.01113912, "auxiliary_loss_mlp": 0.01029048, "balance_loss_clip": 1.04682064, "balance_loss_mlp": 1.02206492, "epoch": 0.5828172909276739, "flos": 17858053854720.0, "grad_norm": 1.6453948273032635, "language_loss": 0.8337611, "learning_rate": 1.5641848610453218e-06, "loss": 0.85519069, "num_input_tokens_seen": 104496355, "step": 4847, "time_per_iteration": 2.560307264328003 }, { "auxiliary_loss_clip": 0.01154072, "auxiliary_loss_mlp": 0.01026838, "balance_loss_clip": 1.05106735, "balance_loss_mlp": 1.01930428, "epoch": 0.582937533818313, "flos": 19865244827520.0, "grad_norm": 1.9090144995555804, "language_loss": 0.85794461, "learning_rate": 1.563424641621158e-06, "loss": 0.87975371, "num_input_tokens_seen": 104515535, "step": 4848, "time_per_iteration": 2.5104312896728516 }, { "auxiliary_loss_clip": 0.01146396, "auxiliary_loss_mlp": 0.01024782, "balance_loss_clip": 1.04857326, "balance_loss_mlp": 1.01637816, "epoch": 0.5830577767089521, "flos": 26870734068480.0, "grad_norm": 2.2221618015082427, "language_loss": 0.69987863, "learning_rate": 1.5626644884251282e-06, "loss": 0.7215904, "num_input_tokens_seen": 104535055, "step": 4849, "time_per_iteration": 3.3997108936309814 }, { "auxiliary_loss_clip": 0.01169659, "auxiliary_loss_mlp": 0.01024323, "balance_loss_clip": 1.05113292, "balance_loss_mlp": 1.01707792, "epoch": 0.5831780195995911, "flos": 25298205575040.0, "grad_norm": 1.74805622058221, "language_loss": 0.8809036, "learning_rate": 1.5619044015725488e-06, "loss": 0.90284348, "num_input_tokens_seen": 104554745, "step": 4850, "time_per_iteration": 2.4941868782043457 }, { "auxiliary_loss_clip": 0.01178758, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.05652821, "balance_loss_mlp": 1.02315271, "epoch": 0.5832982624902303, "flos": 14756988049920.0, "grad_norm": 2.2189850639961706, "language_loss": 0.87270528, "learning_rate": 1.5611443811787224e-06, "loss": 0.89480329, "num_input_tokens_seen": 104568870, "step": 4851, "time_per_iteration": 2.434347629547119 }, { "auxiliary_loss_clip": 0.0115671, "auxiliary_loss_mlp": 0.01028124, "balance_loss_clip": 1.05247247, "balance_loss_mlp": 1.02157044, "epoch": 0.5834185053808694, "flos": 20444555376000.0, "grad_norm": 2.168894311170541, "language_loss": 0.69712985, "learning_rate": 1.560384427358945e-06, "loss": 0.71897817, "num_input_tokens_seen": 104588415, "step": 4852, "time_per_iteration": 2.489314317703247 }, { "auxiliary_loss_clip": 0.01135257, "auxiliary_loss_mlp": 0.01027829, "balance_loss_clip": 1.04399776, "balance_loss_mlp": 1.02024138, "epoch": 0.5835387482715084, "flos": 27200394115200.0, "grad_norm": 1.4829691165411758, "language_loss": 0.73171854, "learning_rate": 1.5596245402284998e-06, "loss": 0.75334936, "num_input_tokens_seen": 104611940, "step": 4853, "time_per_iteration": 2.5984067916870117 }, { "auxiliary_loss_clip": 0.01159588, "auxiliary_loss_mlp": 0.01028992, "balance_loss_clip": 1.05269468, "balance_loss_mlp": 1.02154756, "epoch": 0.5836589911621476, "flos": 16654615562880.0, "grad_norm": 1.7514720525333853, "language_loss": 0.81907368, "learning_rate": 1.5588647199026619e-06, "loss": 0.84095943, "num_input_tokens_seen": 104629675, "step": 4854, "time_per_iteration": 2.4673664569854736 }, { "auxiliary_loss_clip": 0.01175494, "auxiliary_loss_mlp": 0.0102628, "balance_loss_clip": 1.05386484, "balance_loss_mlp": 1.01910067, "epoch": 0.5837792340527866, "flos": 20446817932800.0, "grad_norm": 2.0695717459997294, "language_loss": 0.87748593, "learning_rate": 1.5581049664966956e-06, "loss": 0.89950371, "num_input_tokens_seen": 104647435, "step": 4855, "time_per_iteration": 2.442946672439575 }, { "auxiliary_loss_clip": 0.01011156, "auxiliary_loss_mlp": 0.01002434, "balance_loss_clip": 1.0102632, "balance_loss_mlp": 1.00127149, "epoch": 0.5838994769434257, "flos": 65995480765440.0, "grad_norm": 0.9904263493673009, "language_loss": 0.65204132, "learning_rate": 1.5573452801258545e-06, "loss": 0.6721772, "num_input_tokens_seen": 104694605, "step": 4856, "time_per_iteration": 3.0067784786224365 }, { "auxiliary_loss_clip": 0.01164211, "auxiliary_loss_mlp": 0.01033255, "balance_loss_clip": 1.05262136, "balance_loss_mlp": 1.02583992, "epoch": 0.5840197198340649, "flos": 21470523546240.0, "grad_norm": 1.9455751269205175, "language_loss": 0.63345504, "learning_rate": 1.5565856609053824e-06, "loss": 0.65542966, "num_input_tokens_seen": 104713400, "step": 4857, "time_per_iteration": 2.505228042602539 }, { "auxiliary_loss_clip": 0.01171769, "auxiliary_loss_mlp": 0.01030231, "balance_loss_clip": 1.05277121, "balance_loss_mlp": 1.02264333, "epoch": 0.5841399627247039, "flos": 19135144984320.0, "grad_norm": 38.60943201965022, "language_loss": 0.79598427, "learning_rate": 1.5558261089505127e-06, "loss": 0.81800425, "num_input_tokens_seen": 104732130, "step": 4858, "time_per_iteration": 2.449418306350708 }, { "auxiliary_loss_clip": 0.01157949, "auxiliary_loss_mlp": 0.01029704, "balance_loss_clip": 1.05342555, "balance_loss_mlp": 1.02258992, "epoch": 0.584260205615343, "flos": 26425692558720.0, "grad_norm": 1.8760304064627622, "language_loss": 0.80200237, "learning_rate": 1.5550666243764697e-06, "loss": 0.82387888, "num_input_tokens_seen": 104750290, "step": 4859, "time_per_iteration": 2.5435678958892822 }, { "auxiliary_loss_clip": 0.01156838, "auxiliary_loss_mlp": 0.01027127, "balance_loss_clip": 1.05095088, "balance_loss_mlp": 1.01989126, "epoch": 0.584380448505982, "flos": 13881809174400.0, "grad_norm": 2.1883210242533027, "language_loss": 0.77089959, "learning_rate": 1.554307207298465e-06, "loss": 0.79273921, "num_input_tokens_seen": 104768550, "step": 4860, "time_per_iteration": 2.467388868331909 }, { "auxiliary_loss_clip": 0.01174378, "auxiliary_loss_mlp": 0.01033908, "balance_loss_clip": 1.05320644, "balance_loss_mlp": 1.02593541, "epoch": 0.5845006913966212, "flos": 21543709507200.0, "grad_norm": 3.880492985685576, "language_loss": 0.78467155, "learning_rate": 1.553547857831704e-06, "loss": 0.80675441, "num_input_tokens_seen": 104785060, "step": 4861, "time_per_iteration": 2.479334831237793 }, { "auxiliary_loss_clip": 0.01060401, "auxiliary_loss_mlp": 0.0100135, "balance_loss_clip": 1.01187062, "balance_loss_mlp": 1.0002594, "epoch": 0.5846209342872603, "flos": 58375452712320.0, "grad_norm": 1.5086142326966725, "language_loss": 0.64268041, "learning_rate": 1.5527885760913771e-06, "loss": 0.66329801, "num_input_tokens_seen": 104834950, "step": 4862, "time_per_iteration": 2.901493787765503 }, { "auxiliary_loss_clip": 0.0114583, "auxiliary_loss_mlp": 0.01031549, "balance_loss_clip": 1.05089855, "balance_loss_mlp": 1.02503991, "epoch": 0.5847411771778993, "flos": 18588045957120.0, "grad_norm": 1.781349329639557, "language_loss": 0.76385748, "learning_rate": 1.552029362192668e-06, "loss": 0.7856313, "num_input_tokens_seen": 104854210, "step": 4863, "time_per_iteration": 2.5201826095581055 }, { "auxiliary_loss_clip": 0.01127093, "auxiliary_loss_mlp": 0.01032495, "balance_loss_clip": 1.04727793, "balance_loss_mlp": 1.02537227, "epoch": 0.5848614200685385, "flos": 24240780069120.0, "grad_norm": 1.9242041392750093, "language_loss": 0.72349977, "learning_rate": 1.5512702162507478e-06, "loss": 0.74509567, "num_input_tokens_seen": 104874525, "step": 4864, "time_per_iteration": 2.594768762588501 }, { "auxiliary_loss_clip": 0.01040852, "auxiliary_loss_mlp": 0.01001173, "balance_loss_clip": 1.01238108, "balance_loss_mlp": 1.0001297, "epoch": 0.5849816629591775, "flos": 71660245933440.0, "grad_norm": 1.1278782501392024, "language_loss": 0.55818605, "learning_rate": 1.5505111383807792e-06, "loss": 0.57860625, "num_input_tokens_seen": 104937195, "step": 4865, "time_per_iteration": 3.936511516571045 }, { "auxiliary_loss_clip": 0.01116636, "auxiliary_loss_mlp": 0.01022055, "balance_loss_clip": 1.04273975, "balance_loss_mlp": 1.01515293, "epoch": 0.5851019058498166, "flos": 23802095266560.0, "grad_norm": 1.7591303506194025, "language_loss": 0.80455691, "learning_rate": 1.5497521286979138e-06, "loss": 0.82594383, "num_input_tokens_seen": 104957435, "step": 4866, "time_per_iteration": 2.6219053268432617 }, { "auxiliary_loss_clip": 0.01134632, "auxiliary_loss_mlp": 0.0102679, "balance_loss_clip": 1.04861307, "balance_loss_mlp": 1.01936293, "epoch": 0.5852221487404557, "flos": 24388516707840.0, "grad_norm": 2.385620986582856, "language_loss": 0.74255443, "learning_rate": 1.5489931873172927e-06, "loss": 0.76416862, "num_input_tokens_seen": 104978755, "step": 4867, "time_per_iteration": 2.5996220111846924 }, { "auxiliary_loss_clip": 0.0108198, "auxiliary_loss_mlp": 0.010258, "balance_loss_clip": 1.03809953, "balance_loss_mlp": 1.01817071, "epoch": 0.5853423916310948, "flos": 27271425260160.0, "grad_norm": 1.6898278836169136, "language_loss": 0.7918334, "learning_rate": 1.5482343143540467e-06, "loss": 0.81291115, "num_input_tokens_seen": 105000020, "step": 4868, "time_per_iteration": 2.6843981742858887 }, { "auxiliary_loss_clip": 0.01128419, "auxiliary_loss_mlp": 0.00760852, "balance_loss_clip": 1.04610157, "balance_loss_mlp": 1.00020313, "epoch": 0.5854626345217339, "flos": 11983786611840.0, "grad_norm": 1.9137062332089865, "language_loss": 0.82431877, "learning_rate": 1.547475509923295e-06, "loss": 0.84321153, "num_input_tokens_seen": 105017060, "step": 4869, "time_per_iteration": 2.5343446731567383 }, { "auxiliary_loss_clip": 0.0101965, "auxiliary_loss_mlp": 0.00999801, "balance_loss_clip": 1.01192594, "balance_loss_mlp": 0.99870473, "epoch": 0.585582877412373, "flos": 64342335173760.0, "grad_norm": 0.7328556917102086, "language_loss": 0.56109107, "learning_rate": 1.5467167741401495e-06, "loss": 0.5812856, "num_input_tokens_seen": 105078540, "step": 4870, "time_per_iteration": 3.1773455142974854 }, { "auxiliary_loss_clip": 0.01140969, "auxiliary_loss_mlp": 0.01029339, "balance_loss_clip": 1.04522038, "balance_loss_mlp": 1.02212405, "epoch": 0.5857031203030121, "flos": 17011926103680.0, "grad_norm": 1.980483207224238, "language_loss": 0.7140227, "learning_rate": 1.5459581071197083e-06, "loss": 0.73572576, "num_input_tokens_seen": 105094200, "step": 4871, "time_per_iteration": 3.297919511795044 }, { "auxiliary_loss_clip": 0.01162515, "auxiliary_loss_mlp": 0.01029342, "balance_loss_clip": 1.05389869, "balance_loss_mlp": 1.02180839, "epoch": 0.5858233631936511, "flos": 20885682303360.0, "grad_norm": 2.0890385543943912, "language_loss": 0.83430851, "learning_rate": 1.5451995089770624e-06, "loss": 0.85622716, "num_input_tokens_seen": 105113985, "step": 4872, "time_per_iteration": 2.509453058242798 }, { "auxiliary_loss_clip": 0.01172632, "auxiliary_loss_mlp": 0.01023995, "balance_loss_clip": 1.05307484, "balance_loss_mlp": 1.01723254, "epoch": 0.5859436060842903, "flos": 23191902000000.0, "grad_norm": 1.3105149828120086, "language_loss": 0.72061276, "learning_rate": 1.5444409798272885e-06, "loss": 0.74257898, "num_input_tokens_seen": 105138075, "step": 4873, "time_per_iteration": 2.5764496326446533 }, { "auxiliary_loss_clip": 0.01131271, "auxiliary_loss_mlp": 0.01027395, "balance_loss_clip": 1.04639709, "balance_loss_mlp": 1.01955712, "epoch": 0.5860638489749294, "flos": 22492648961280.0, "grad_norm": 1.750430969542323, "language_loss": 0.8051101, "learning_rate": 1.543682519785456e-06, "loss": 0.82669675, "num_input_tokens_seen": 105156555, "step": 4874, "time_per_iteration": 2.5816845893859863 }, { "auxiliary_loss_clip": 0.01142874, "auxiliary_loss_mlp": 0.01026422, "balance_loss_clip": 1.04789639, "balance_loss_mlp": 1.01961482, "epoch": 0.5861840918655684, "flos": 17566243764480.0, "grad_norm": 2.4378225309203603, "language_loss": 0.80179513, "learning_rate": 1.5429241289666219e-06, "loss": 0.823488, "num_input_tokens_seen": 105174055, "step": 4875, "time_per_iteration": 4.095436096191406 }, { "auxiliary_loss_clip": 0.0113479, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.04615784, "balance_loss_mlp": 1.02365017, "epoch": 0.5863043347562076, "flos": 25556152118400.0, "grad_norm": 1.9970828984536362, "language_loss": 0.69383073, "learning_rate": 1.5421658074858342e-06, "loss": 0.71548915, "num_input_tokens_seen": 105192160, "step": 4876, "time_per_iteration": 2.5670664310455322 }, { "auxiliary_loss_clip": 0.01138719, "auxiliary_loss_mlp": 0.01032843, "balance_loss_clip": 1.04754591, "balance_loss_mlp": 1.0252552, "epoch": 0.5864245776468466, "flos": 20667525050880.0, "grad_norm": 2.182059164787899, "language_loss": 0.66334164, "learning_rate": 1.5414075554581298e-06, "loss": 0.68505728, "num_input_tokens_seen": 105210205, "step": 4877, "time_per_iteration": 2.533712387084961 }, { "auxiliary_loss_clip": 0.01172528, "auxiliary_loss_mlp": 0.01027847, "balance_loss_clip": 1.05179, "balance_loss_mlp": 1.02023566, "epoch": 0.5865448205374857, "flos": 28913907490560.0, "grad_norm": 2.367192826429621, "language_loss": 0.78074396, "learning_rate": 1.5406493729985348e-06, "loss": 0.80274773, "num_input_tokens_seen": 105229400, "step": 4878, "time_per_iteration": 2.516009569168091 }, { "auxiliary_loss_clip": 0.01116542, "auxiliary_loss_mlp": 0.00760687, "balance_loss_clip": 1.04660869, "balance_loss_mlp": 1.00016594, "epoch": 0.5866650634281249, "flos": 25842575168640.0, "grad_norm": 2.222934135137192, "language_loss": 0.71407384, "learning_rate": 1.5398912602220644e-06, "loss": 0.73284614, "num_input_tokens_seen": 105248675, "step": 4879, "time_per_iteration": 2.646860122680664 }, { "auxiliary_loss_clip": 0.0112232, "auxiliary_loss_mlp": 0.01028019, "balance_loss_clip": 1.04613137, "balance_loss_mlp": 1.02063704, "epoch": 0.5867853063187639, "flos": 17052325925760.0, "grad_norm": 4.128134555490838, "language_loss": 0.79115534, "learning_rate": 1.539133217243724e-06, "loss": 0.81265867, "num_input_tokens_seen": 105265695, "step": 4880, "time_per_iteration": 2.557936906814575 }, { "auxiliary_loss_clip": 0.01136185, "auxiliary_loss_mlp": 0.01030464, "balance_loss_clip": 1.04588652, "balance_loss_mlp": 1.02208996, "epoch": 0.586905549209403, "flos": 24645026707200.0, "grad_norm": 4.2087026675962536, "language_loss": 0.76247954, "learning_rate": 1.5383752441785081e-06, "loss": 0.78414607, "num_input_tokens_seen": 105284920, "step": 4881, "time_per_iteration": 2.606729030609131 }, { "auxiliary_loss_clip": 0.01164171, "auxiliary_loss_mlp": 0.01031154, "balance_loss_clip": 1.05215085, "balance_loss_mlp": 1.02350116, "epoch": 0.5870257921000421, "flos": 14720538723840.0, "grad_norm": 2.2060155545073927, "language_loss": 0.85456866, "learning_rate": 1.5376173411414003e-06, "loss": 0.87652194, "num_input_tokens_seen": 105302960, "step": 4882, "time_per_iteration": 2.474823236465454 }, { "auxiliary_loss_clip": 0.01143543, "auxiliary_loss_mlp": 0.01029326, "balance_loss_clip": 1.0452373, "balance_loss_mlp": 1.02216446, "epoch": 0.5871460349906812, "flos": 23914998691200.0, "grad_norm": 1.884649094622077, "language_loss": 0.78933263, "learning_rate": 1.5368595082473753e-06, "loss": 0.81106132, "num_input_tokens_seen": 105321260, "step": 4883, "time_per_iteration": 2.5613725185394287 }, { "auxiliary_loss_clip": 0.0115608, "auxiliary_loss_mlp": 0.01022139, "balance_loss_clip": 1.04795837, "balance_loss_mlp": 1.01514721, "epoch": 0.5872662778813202, "flos": 22164174063360.0, "grad_norm": 1.7293067771025656, "language_loss": 0.78209245, "learning_rate": 1.5361017456113935e-06, "loss": 0.80387461, "num_input_tokens_seen": 105341610, "step": 4884, "time_per_iteration": 2.5066311359405518 }, { "auxiliary_loss_clip": 0.01157438, "auxiliary_loss_mlp": 0.01027175, "balance_loss_clip": 1.04807043, "balance_loss_mlp": 1.01930165, "epoch": 0.5873865207719594, "flos": 18441925430400.0, "grad_norm": 2.019789961321854, "language_loss": 0.85467267, "learning_rate": 1.5353440533484085e-06, "loss": 0.87651879, "num_input_tokens_seen": 105360465, "step": 4885, "time_per_iteration": 2.512587070465088 }, { "auxiliary_loss_clip": 0.0114568, "auxiliary_loss_mlp": 0.01025608, "balance_loss_clip": 1.04821312, "balance_loss_mlp": 1.01761544, "epoch": 0.5875067636625985, "flos": 54015321427200.0, "grad_norm": 1.6979826261719433, "language_loss": 0.66163158, "learning_rate": 1.534586431573361e-06, "loss": 0.68334442, "num_input_tokens_seen": 105385405, "step": 4886, "time_per_iteration": 2.807114362716675 }, { "auxiliary_loss_clip": 0.01104595, "auxiliary_loss_mlp": 0.01031011, "balance_loss_clip": 1.04142225, "balance_loss_mlp": 1.0232203, "epoch": 0.5876270065532375, "flos": 27995707100160.0, "grad_norm": 1.6898003988532517, "language_loss": 0.7902602, "learning_rate": 1.5338288804011817e-06, "loss": 0.81161624, "num_input_tokens_seen": 105404905, "step": 4887, "time_per_iteration": 2.6721949577331543 }, { "auxiliary_loss_clip": 0.01138439, "auxiliary_loss_mlp": 0.0102674, "balance_loss_clip": 1.04510462, "balance_loss_mlp": 1.01922941, "epoch": 0.5877472494438767, "flos": 21361462876800.0, "grad_norm": 1.919845247092281, "language_loss": 0.71005845, "learning_rate": 1.533071399946791e-06, "loss": 0.73171031, "num_input_tokens_seen": 105423650, "step": 4888, "time_per_iteration": 2.518092393875122 }, { "auxiliary_loss_clip": 0.01142624, "auxiliary_loss_mlp": 0.01028185, "balance_loss_clip": 1.04807734, "balance_loss_mlp": 1.02096677, "epoch": 0.5878674923345157, "flos": 22383013674240.0, "grad_norm": 2.153065227257642, "language_loss": 0.57670295, "learning_rate": 1.5323139903250977e-06, "loss": 0.59841108, "num_input_tokens_seen": 105444255, "step": 4889, "time_per_iteration": 2.5541951656341553 }, { "auxiliary_loss_clip": 0.01148484, "auxiliary_loss_mlp": 0.0102826, "balance_loss_clip": 1.05215096, "balance_loss_mlp": 1.02079153, "epoch": 0.5879877352251548, "flos": 21868664872320.0, "grad_norm": 1.4786514369628596, "language_loss": 0.77139497, "learning_rate": 1.5315566516510002e-06, "loss": 0.79316235, "num_input_tokens_seen": 105462425, "step": 4890, "time_per_iteration": 2.535137891769409 }, { "auxiliary_loss_clip": 0.01172306, "auxiliary_loss_mlp": 0.01025783, "balance_loss_clip": 1.05195308, "balance_loss_mlp": 1.01837087, "epoch": 0.5881079781157939, "flos": 17493811989120.0, "grad_norm": 1.6585637110155793, "language_loss": 0.67828214, "learning_rate": 1.5307993840393857e-06, "loss": 0.70026302, "num_input_tokens_seen": 105480505, "step": 4891, "time_per_iteration": 3.297316789627075 }, { "auxiliary_loss_clip": 0.01169822, "auxiliary_loss_mlp": 0.0102721, "balance_loss_clip": 1.0504024, "balance_loss_mlp": 1.02044749, "epoch": 0.588228221006433, "flos": 22601853285120.0, "grad_norm": 1.858044534124567, "language_loss": 0.80612922, "learning_rate": 1.530042187605132e-06, "loss": 0.82809949, "num_input_tokens_seen": 105499760, "step": 4892, "time_per_iteration": 2.4902749061584473 }, { "auxiliary_loss_clip": 0.0115904, "auxiliary_loss_mlp": 0.00760005, "balance_loss_clip": 1.05031109, "balance_loss_mlp": 1.00013566, "epoch": 0.5883484638970721, "flos": 26176939896960.0, "grad_norm": 1.4063164877226104, "language_loss": 0.84168983, "learning_rate": 1.5292850624631044e-06, "loss": 0.86088026, "num_input_tokens_seen": 105521955, "step": 4893, "time_per_iteration": 2.5578320026397705 }, { "auxiliary_loss_clip": 0.01151644, "auxiliary_loss_mlp": 0.01030909, "balance_loss_clip": 1.04943359, "balance_loss_mlp": 1.02373266, "epoch": 0.5884687067877111, "flos": 30443737691520.0, "grad_norm": 1.8820540599268687, "language_loss": 0.80111945, "learning_rate": 1.5285280087281593e-06, "loss": 0.822945, "num_input_tokens_seen": 105542685, "step": 4894, "time_per_iteration": 2.5754315853118896 }, { "auxiliary_loss_clip": 0.01042978, "auxiliary_loss_mlp": 0.01008085, "balance_loss_clip": 1.01260996, "balance_loss_mlp": 1.00676811, "epoch": 0.5885889496783503, "flos": 70507550580480.0, "grad_norm": 0.6482687509409895, "language_loss": 0.56631756, "learning_rate": 1.5277710265151398e-06, "loss": 0.58682823, "num_input_tokens_seen": 105612165, "step": 4895, "time_per_iteration": 3.283946990966797 }, { "auxiliary_loss_clip": 0.01156842, "auxiliary_loss_mlp": 0.01029021, "balance_loss_clip": 1.0489471, "balance_loss_mlp": 1.02111745, "epoch": 0.5887091925689893, "flos": 19098767485440.0, "grad_norm": 2.920746522994618, "language_loss": 0.77364838, "learning_rate": 1.5270141159388803e-06, "loss": 0.79550707, "num_input_tokens_seen": 105629185, "step": 4896, "time_per_iteration": 2.4842166900634766 }, { "auxiliary_loss_clip": 0.01169999, "auxiliary_loss_mlp": 0.01029543, "balance_loss_clip": 1.05002594, "balance_loss_mlp": 1.02151442, "epoch": 0.5888294354596284, "flos": 23294282739840.0, "grad_norm": 1.6388209494567, "language_loss": 0.80499077, "learning_rate": 1.526257277114203e-06, "loss": 0.82698613, "num_input_tokens_seen": 105650260, "step": 4897, "time_per_iteration": 2.4983887672424316 }, { "auxiliary_loss_clip": 0.01141942, "auxiliary_loss_mlp": 0.01025302, "balance_loss_clip": 1.04969406, "balance_loss_mlp": 1.01808071, "epoch": 0.5889496783502676, "flos": 21981532383360.0, "grad_norm": 1.8278107779294945, "language_loss": 0.79342842, "learning_rate": 1.5255005101559201e-06, "loss": 0.81510085, "num_input_tokens_seen": 105667870, "step": 4898, "time_per_iteration": 3.324629545211792 }, { "auxiliary_loss_clip": 0.01157671, "auxiliary_loss_mlp": 0.01031649, "balance_loss_clip": 1.04963076, "balance_loss_mlp": 1.02480328, "epoch": 0.5890699212409066, "flos": 21685233093120.0, "grad_norm": 1.8676212334019406, "language_loss": 0.76633704, "learning_rate": 1.524743815178833e-06, "loss": 0.7882303, "num_input_tokens_seen": 105685830, "step": 4899, "time_per_iteration": 2.5018045902252197 }, { "auxiliary_loss_clip": 0.01143417, "auxiliary_loss_mlp": 0.01030543, "balance_loss_clip": 1.04561651, "balance_loss_mlp": 1.02351832, "epoch": 0.5891901641315457, "flos": 19464553635840.0, "grad_norm": 1.8049702374308343, "language_loss": 0.8092925, "learning_rate": 1.5239871922977315e-06, "loss": 0.8310321, "num_input_tokens_seen": 105705745, "step": 4900, "time_per_iteration": 2.5529839992523193 }, { "auxiliary_loss_clip": 0.01142887, "auxiliary_loss_mlp": 0.010274, "balance_loss_clip": 1.04764831, "balance_loss_mlp": 1.02028966, "epoch": 0.5893104070221848, "flos": 19609884063360.0, "grad_norm": 1.6703471734701048, "language_loss": 0.89424366, "learning_rate": 1.523230641627394e-06, "loss": 0.9159466, "num_input_tokens_seen": 105724730, "step": 4901, "time_per_iteration": 4.175079107284546 }, { "auxiliary_loss_clip": 0.0111598, "auxiliary_loss_mlp": 0.01028995, "balance_loss_clip": 1.04138231, "balance_loss_mlp": 1.02192271, "epoch": 0.5894306499128239, "flos": 29060063930880.0, "grad_norm": 1.8356308475899945, "language_loss": 0.72687101, "learning_rate": 1.5224741632825888e-06, "loss": 0.74832076, "num_input_tokens_seen": 105744920, "step": 4902, "time_per_iteration": 2.662339687347412 }, { "auxiliary_loss_clip": 0.01176102, "auxiliary_loss_mlp": 0.01028912, "balance_loss_clip": 1.0546, "balance_loss_mlp": 1.02137244, "epoch": 0.589550892803463, "flos": 42298890721920.0, "grad_norm": 1.7053620440638568, "language_loss": 0.69285369, "learning_rate": 1.521717757378074e-06, "loss": 0.71490377, "num_input_tokens_seen": 105765465, "step": 4903, "time_per_iteration": 2.6617751121520996 }, { "auxiliary_loss_clip": 0.01163631, "auxiliary_loss_mlp": 0.01030485, "balance_loss_clip": 1.0514946, "balance_loss_mlp": 1.0225457, "epoch": 0.5896711356941021, "flos": 14137062197760.0, "grad_norm": 1.9114130241308882, "language_loss": 0.69196153, "learning_rate": 1.5209614240285943e-06, "loss": 0.71390265, "num_input_tokens_seen": 105783120, "step": 4904, "time_per_iteration": 2.483524799346924 }, { "auxiliary_loss_clip": 0.01171919, "auxiliary_loss_mlp": 0.00761188, "balance_loss_clip": 1.05192935, "balance_loss_mlp": 1.00017202, "epoch": 0.5897913785847412, "flos": 17201355454080.0, "grad_norm": 1.9247187646404937, "language_loss": 0.84784108, "learning_rate": 1.520205163348887e-06, "loss": 0.86717212, "num_input_tokens_seen": 105801055, "step": 4905, "time_per_iteration": 2.478612184524536 }, { "auxiliary_loss_clip": 0.01036183, "auxiliary_loss_mlp": 0.01000649, "balance_loss_clip": 1.01423967, "balance_loss_mlp": 0.99936724, "epoch": 0.5899116214753802, "flos": 48794164202880.0, "grad_norm": 0.7258831705606584, "language_loss": 0.5692696, "learning_rate": 1.519448975453674e-06, "loss": 0.58963799, "num_input_tokens_seen": 105856155, "step": 4906, "time_per_iteration": 3.023139238357544 }, { "auxiliary_loss_clip": 0.01164859, "auxiliary_loss_mlp": 0.0076138, "balance_loss_clip": 1.05655003, "balance_loss_mlp": 1.00018668, "epoch": 0.5900318643660194, "flos": 21103659987840.0, "grad_norm": 2.1740961821739986, "language_loss": 0.75565284, "learning_rate": 1.5186928604576696e-06, "loss": 0.77491522, "num_input_tokens_seen": 105873350, "step": 4907, "time_per_iteration": 2.497197389602661 }, { "auxiliary_loss_clip": 0.01144302, "auxiliary_loss_mlp": 0.01031222, "balance_loss_clip": 1.0468775, "balance_loss_mlp": 1.02392614, "epoch": 0.5901521072566585, "flos": 21178390233600.0, "grad_norm": 2.0793663183361346, "language_loss": 0.77203268, "learning_rate": 1.5179368184755752e-06, "loss": 0.7937879, "num_input_tokens_seen": 105891435, "step": 4908, "time_per_iteration": 2.5226454734802246 }, { "auxiliary_loss_clip": 0.01142521, "auxiliary_loss_mlp": 0.01022716, "balance_loss_clip": 1.04854357, "balance_loss_mlp": 1.0159986, "epoch": 0.5902723501472975, "flos": 20225967160320.0, "grad_norm": 1.6045343990891583, "language_loss": 0.82569194, "learning_rate": 1.5171808496220821e-06, "loss": 0.84734428, "num_input_tokens_seen": 105910190, "step": 4909, "time_per_iteration": 2.543941020965576 }, { "auxiliary_loss_clip": 0.0114885, "auxiliary_loss_mlp": 0.01029688, "balance_loss_clip": 1.04810178, "balance_loss_mlp": 1.02239537, "epoch": 0.5903925930379367, "flos": 22964407211520.0, "grad_norm": 1.7016647151417654, "language_loss": 0.81568986, "learning_rate": 1.5164249540118708e-06, "loss": 0.83747518, "num_input_tokens_seen": 105929315, "step": 4910, "time_per_iteration": 2.560776948928833 }, { "auxiliary_loss_clip": 0.01106361, "auxiliary_loss_mlp": 0.0102794, "balance_loss_clip": 1.04239416, "balance_loss_mlp": 1.02046847, "epoch": 0.5905128359285757, "flos": 23367720096000.0, "grad_norm": 1.541881026419899, "language_loss": 0.83108616, "learning_rate": 1.5156691317596093e-06, "loss": 0.85242915, "num_input_tokens_seen": 105950740, "step": 4911, "time_per_iteration": 2.6541121006011963 }, { "auxiliary_loss_clip": 0.01157263, "auxiliary_loss_mlp": 0.00760177, "balance_loss_clip": 1.04877138, "balance_loss_mlp": 1.00019825, "epoch": 0.5906330788192148, "flos": 28032335994240.0, "grad_norm": 2.0532310463698433, "language_loss": 0.6626054, "learning_rate": 1.5149133829799556e-06, "loss": 0.6817798, "num_input_tokens_seen": 105968735, "step": 4912, "time_per_iteration": 2.57694149017334 }, { "auxiliary_loss_clip": 0.01148009, "auxiliary_loss_mlp": 0.0102452, "balance_loss_clip": 1.04926372, "balance_loss_mlp": 1.01772475, "epoch": 0.590753321709854, "flos": 18477943793280.0, "grad_norm": 2.137545618444139, "language_loss": 0.80904931, "learning_rate": 1.5141577077875556e-06, "loss": 0.83077461, "num_input_tokens_seen": 105986060, "step": 4913, "time_per_iteration": 2.5129330158233643 }, { "auxiliary_loss_clip": 0.01160855, "auxiliary_loss_mlp": 0.01025352, "balance_loss_clip": 1.05124164, "balance_loss_mlp": 1.01846802, "epoch": 0.590873564600493, "flos": 16873706568960.0, "grad_norm": 1.8266482896711302, "language_loss": 0.72268641, "learning_rate": 1.5134021062970451e-06, "loss": 0.7445485, "num_input_tokens_seen": 106004440, "step": 4914, "time_per_iteration": 2.5013575553894043 }, { "auxiliary_loss_clip": 0.01121393, "auxiliary_loss_mlp": 0.01033111, "balance_loss_clip": 1.04867816, "balance_loss_mlp": 1.02569056, "epoch": 0.5909938074911321, "flos": 13516166678400.0, "grad_norm": 1.918777874394368, "language_loss": 0.81061244, "learning_rate": 1.5126465786230483e-06, "loss": 0.83215749, "num_input_tokens_seen": 106021215, "step": 4915, "time_per_iteration": 2.5504186153411865 }, { "auxiliary_loss_clip": 0.01171692, "auxiliary_loss_mlp": 0.01028517, "balance_loss_clip": 1.05151987, "balance_loss_mlp": 1.02131629, "epoch": 0.5911140503817712, "flos": 26024067613440.0, "grad_norm": 1.6988681741621354, "language_loss": 0.82101023, "learning_rate": 1.5118911248801787e-06, "loss": 0.84301233, "num_input_tokens_seen": 106039225, "step": 4916, "time_per_iteration": 2.52847957611084 }, { "auxiliary_loss_clip": 0.01153985, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 1.04909873, "balance_loss_mlp": 1.02406716, "epoch": 0.5912342932724103, "flos": 23258731253760.0, "grad_norm": 2.8385331723413287, "language_loss": 0.79594541, "learning_rate": 1.5111357451830364e-06, "loss": 0.81779313, "num_input_tokens_seen": 106057920, "step": 4917, "time_per_iteration": 3.2237794399261475 }, { "auxiliary_loss_clip": 0.01156829, "auxiliary_loss_mlp": 0.0102918, "balance_loss_clip": 1.04910982, "balance_loss_mlp": 1.02286792, "epoch": 0.5913545361630493, "flos": 19573039687680.0, "grad_norm": 1.8364539734887542, "language_loss": 0.71392214, "learning_rate": 1.5103804396462131e-06, "loss": 0.73578227, "num_input_tokens_seen": 106077855, "step": 4918, "time_per_iteration": 2.522834062576294 }, { "auxiliary_loss_clip": 0.01158961, "auxiliary_loss_mlp": 0.01029614, "balance_loss_clip": 1.04752111, "balance_loss_mlp": 1.02168667, "epoch": 0.5914747790536885, "flos": 26213532877440.0, "grad_norm": 2.2966766142768775, "language_loss": 0.79817551, "learning_rate": 1.5096252083842877e-06, "loss": 0.82006127, "num_input_tokens_seen": 106097065, "step": 4919, "time_per_iteration": 2.529672384262085 }, { "auxiliary_loss_clip": 0.01154649, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.0469259, "balance_loss_mlp": 1.02026558, "epoch": 0.5915950219443276, "flos": 27417545786880.0, "grad_norm": 1.7225577234607803, "language_loss": 0.85396218, "learning_rate": 1.5088700515118285e-06, "loss": 0.87578803, "num_input_tokens_seen": 106116385, "step": 4920, "time_per_iteration": 2.5839152336120605 }, { "auxiliary_loss_clip": 0.01126847, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 1.04669333, "balance_loss_mlp": 1.02526963, "epoch": 0.5917152648349666, "flos": 21907879545600.0, "grad_norm": 1.650192691341449, "language_loss": 0.66642636, "learning_rate": 1.508114969143392e-06, "loss": 0.688025, "num_input_tokens_seen": 106136370, "step": 4921, "time_per_iteration": 2.5942749977111816 }, { "auxiliary_loss_clip": 0.01140232, "auxiliary_loss_mlp": 0.01025666, "balance_loss_clip": 1.04396558, "balance_loss_mlp": 1.01908588, "epoch": 0.5918355077256057, "flos": 28109185142400.0, "grad_norm": 1.5231740737225965, "language_loss": 0.77531517, "learning_rate": 1.5073599613935238e-06, "loss": 0.79697418, "num_input_tokens_seen": 106158490, "step": 4922, "time_per_iteration": 2.6173675060272217 }, { "auxiliary_loss_clip": 0.01141613, "auxiliary_loss_mlp": 0.01030431, "balance_loss_clip": 1.04745555, "balance_loss_mlp": 1.02308774, "epoch": 0.5919557506162448, "flos": 28183807647360.0, "grad_norm": 1.871509948582554, "language_loss": 0.57680422, "learning_rate": 1.5066050283767574e-06, "loss": 0.59852463, "num_input_tokens_seen": 106179170, "step": 4923, "time_per_iteration": 2.5840277671813965 }, { "auxiliary_loss_clip": 0.0113438, "auxiliary_loss_mlp": 0.0102445, "balance_loss_clip": 1.04604959, "balance_loss_mlp": 1.01751459, "epoch": 0.5920759935068839, "flos": 12094355652480.0, "grad_norm": 1.8455383663932154, "language_loss": 0.82635844, "learning_rate": 1.505850170207616e-06, "loss": 0.84794682, "num_input_tokens_seen": 106196035, "step": 4924, "time_per_iteration": 3.261245012283325 }, { "auxiliary_loss_clip": 0.01142211, "auxiliary_loss_mlp": 0.01025952, "balance_loss_clip": 1.04627705, "balance_loss_mlp": 1.0190289, "epoch": 0.592196236397523, "flos": 29424772673280.0, "grad_norm": 1.9221368445915812, "language_loss": 0.78357834, "learning_rate": 1.505095387000611e-06, "loss": 0.80525994, "num_input_tokens_seen": 106218335, "step": 4925, "time_per_iteration": 2.61132550239563 }, { "auxiliary_loss_clip": 0.01134717, "auxiliary_loss_mlp": 0.01024046, "balance_loss_clip": 1.04788673, "balance_loss_mlp": 1.01723325, "epoch": 0.5923164792881621, "flos": 24384709866240.0, "grad_norm": 2.072709458397679, "language_loss": 0.74224973, "learning_rate": 1.504340678870242e-06, "loss": 0.7638374, "num_input_tokens_seen": 106236550, "step": 4926, "time_per_iteration": 2.555328369140625 }, { "auxiliary_loss_clip": 0.01154651, "auxiliary_loss_mlp": 0.01026527, "balance_loss_clip": 1.048069, "balance_loss_mlp": 1.01861119, "epoch": 0.5924367221788012, "flos": 24024238928640.0, "grad_norm": 1.9015707812745677, "language_loss": 0.89757496, "learning_rate": 1.5035860459309989e-06, "loss": 0.91938674, "num_input_tokens_seen": 106254265, "step": 4927, "time_per_iteration": 4.063888788223267 }, { "auxiliary_loss_clip": 0.01139904, "auxiliary_loss_mlp": 0.01031259, "balance_loss_clip": 1.04792452, "balance_loss_mlp": 1.02351642, "epoch": 0.5925569650694402, "flos": 26870590414080.0, "grad_norm": 2.0267810449516404, "language_loss": 0.6329267, "learning_rate": 1.5028314882973568e-06, "loss": 0.65463841, "num_input_tokens_seen": 106274670, "step": 4928, "time_per_iteration": 2.571357250213623 }, { "auxiliary_loss_clip": 0.01143082, "auxiliary_loss_mlp": 0.01026323, "balance_loss_clip": 1.04780102, "balance_loss_mlp": 1.01880956, "epoch": 0.5926772079600794, "flos": 22302788647680.0, "grad_norm": 1.867633896170128, "language_loss": 0.84454769, "learning_rate": 1.502077006083783e-06, "loss": 0.86624175, "num_input_tokens_seen": 106293330, "step": 4929, "time_per_iteration": 2.5418643951416016 }, { "auxiliary_loss_clip": 0.01158414, "auxiliary_loss_mlp": 0.00760299, "balance_loss_clip": 1.05166495, "balance_loss_mlp": 1.00017071, "epoch": 0.5927974508507184, "flos": 19865244827520.0, "grad_norm": 1.7674895595216809, "language_loss": 0.767335, "learning_rate": 1.5013225994047315e-06, "loss": 0.78652209, "num_input_tokens_seen": 106310960, "step": 4930, "time_per_iteration": 2.5073676109313965 }, { "auxiliary_loss_clip": 0.01158604, "auxiliary_loss_mlp": 0.00760461, "balance_loss_clip": 1.05113912, "balance_loss_mlp": 1.00013125, "epoch": 0.5929176937413575, "flos": 15776743167360.0, "grad_norm": 1.8447548684730877, "language_loss": 0.81064022, "learning_rate": 1.5005682683746452e-06, "loss": 0.82983088, "num_input_tokens_seen": 106329475, "step": 4931, "time_per_iteration": 2.541358470916748 }, { "auxiliary_loss_clip": 0.01157569, "auxiliary_loss_mlp": 0.01028442, "balance_loss_clip": 1.05161786, "balance_loss_mlp": 1.02112269, "epoch": 0.5930379366319967, "flos": 17601472028160.0, "grad_norm": 2.2507506290493606, "language_loss": 0.72738862, "learning_rate": 1.4998140131079553e-06, "loss": 0.74924874, "num_input_tokens_seen": 106345565, "step": 4932, "time_per_iteration": 2.460134744644165 }, { "auxiliary_loss_clip": 0.01097771, "auxiliary_loss_mlp": 0.00760554, "balance_loss_clip": 1.04351926, "balance_loss_mlp": 1.00014257, "epoch": 0.5931581795226357, "flos": 17704283731200.0, "grad_norm": 1.7621857394681002, "language_loss": 0.73521745, "learning_rate": 1.4990598337190821e-06, "loss": 0.75380069, "num_input_tokens_seen": 106361920, "step": 4933, "time_per_iteration": 2.6184725761413574 }, { "auxiliary_loss_clip": 0.01170578, "auxiliary_loss_mlp": 0.00760694, "balance_loss_clip": 1.05137968, "balance_loss_mlp": 1.00011766, "epoch": 0.5932784224132748, "flos": 24280102483200.0, "grad_norm": 1.6769025440705971, "language_loss": 0.6779772, "learning_rate": 1.4983057303224338e-06, "loss": 0.69728994, "num_input_tokens_seen": 106381735, "step": 4934, "time_per_iteration": 2.5065150260925293 }, { "auxiliary_loss_clip": 0.01114978, "auxiliary_loss_mlp": 0.01027065, "balance_loss_clip": 1.04522574, "balance_loss_mlp": 1.02006137, "epoch": 0.5933986653039139, "flos": 22926700909440.0, "grad_norm": 1.602730917779371, "language_loss": 0.87558407, "learning_rate": 1.4975517030324072e-06, "loss": 0.89700449, "num_input_tokens_seen": 106399745, "step": 4935, "time_per_iteration": 2.6094958782196045 }, { "auxiliary_loss_clip": 0.01063837, "auxiliary_loss_mlp": 0.00751368, "balance_loss_clip": 1.01494527, "balance_loss_mlp": 1.00017822, "epoch": 0.593518908194553, "flos": 71121730256640.0, "grad_norm": 0.7814877092464576, "language_loss": 0.61851883, "learning_rate": 1.4967977519633882e-06, "loss": 0.63667083, "num_input_tokens_seen": 106457205, "step": 4936, "time_per_iteration": 3.1946847438812256 }, { "auxiliary_loss_clip": 0.01128936, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 1.04664278, "balance_loss_mlp": 1.02470791, "epoch": 0.593639151085192, "flos": 20448649526400.0, "grad_norm": 1.814706649215006, "language_loss": 0.78118503, "learning_rate": 1.4960438772297494e-06, "loss": 0.80279779, "num_input_tokens_seen": 106474250, "step": 4937, "time_per_iteration": 2.5947492122650146 }, { "auxiliary_loss_clip": 0.0114614, "auxiliary_loss_mlp": 0.01028455, "balance_loss_clip": 1.04608941, "balance_loss_mlp": 1.02064085, "epoch": 0.5937593939758312, "flos": 30883428074880.0, "grad_norm": 2.1796965352311806, "language_loss": 0.73670471, "learning_rate": 1.495290078945855e-06, "loss": 0.75845063, "num_input_tokens_seen": 106494015, "step": 4938, "time_per_iteration": 2.66733455657959 }, { "auxiliary_loss_clip": 0.0116917, "auxiliary_loss_mlp": 0.01026495, "balance_loss_clip": 1.05083203, "balance_loss_mlp": 1.01936936, "epoch": 0.5938796368664703, "flos": 36898069668480.0, "grad_norm": 1.7260550078194208, "language_loss": 0.74305558, "learning_rate": 1.4945363572260529e-06, "loss": 0.76501215, "num_input_tokens_seen": 106515010, "step": 4939, "time_per_iteration": 2.665654420852661 }, { "auxiliary_loss_clip": 0.01153699, "auxiliary_loss_mlp": 0.01021097, "balance_loss_clip": 1.04753351, "balance_loss_mlp": 1.01372373, "epoch": 0.5939998797571093, "flos": 23842926051840.0, "grad_norm": 2.0561318723523363, "language_loss": 0.68349516, "learning_rate": 1.4937827121846845e-06, "loss": 0.70524311, "num_input_tokens_seen": 106535265, "step": 4940, "time_per_iteration": 2.5404906272888184 }, { "auxiliary_loss_clip": 0.01120281, "auxiliary_loss_mlp": 0.01032336, "balance_loss_clip": 1.04523039, "balance_loss_mlp": 1.02506471, "epoch": 0.5941201226477485, "flos": 25191407462400.0, "grad_norm": 1.4974181291318573, "language_loss": 0.7343905, "learning_rate": 1.4930291439360755e-06, "loss": 0.75591671, "num_input_tokens_seen": 106557830, "step": 4941, "time_per_iteration": 2.6068835258483887 }, { "auxiliary_loss_clip": 0.01157658, "auxiliary_loss_mlp": 0.010319, "balance_loss_clip": 1.05024934, "balance_loss_mlp": 1.02437174, "epoch": 0.5942403655383875, "flos": 22418996123520.0, "grad_norm": 1.8020657355664922, "language_loss": 0.79431498, "learning_rate": 1.4922756525945427e-06, "loss": 0.81621057, "num_input_tokens_seen": 106577140, "step": 4942, "time_per_iteration": 2.5212016105651855 }, { "auxiliary_loss_clip": 0.0105153, "auxiliary_loss_mlp": 0.01011245, "balance_loss_clip": 1.01549101, "balance_loss_mlp": 1.01012468, "epoch": 0.5943606084290266, "flos": 67629310796160.0, "grad_norm": 0.9416388444901403, "language_loss": 0.59596115, "learning_rate": 1.4915222382743894e-06, "loss": 0.61658889, "num_input_tokens_seen": 106635975, "step": 4943, "time_per_iteration": 3.88228702545166 }, { "auxiliary_loss_clip": 0.01157404, "auxiliary_loss_mlp": 0.01036981, "balance_loss_clip": 1.05152106, "balance_loss_mlp": 1.0292089, "epoch": 0.5944808513196658, "flos": 18223157646720.0, "grad_norm": 2.5368862216964527, "language_loss": 0.71964741, "learning_rate": 1.4907689010899085e-06, "loss": 0.74159127, "num_input_tokens_seen": 106653555, "step": 4944, "time_per_iteration": 2.477567434310913 }, { "auxiliary_loss_clip": 0.01141904, "auxiliary_loss_mlp": 0.01024615, "balance_loss_clip": 1.04816771, "balance_loss_mlp": 1.01770401, "epoch": 0.5946010942103048, "flos": 24790824011520.0, "grad_norm": 1.838468174874239, "language_loss": 0.62560189, "learning_rate": 1.4900156411553804e-06, "loss": 0.64726704, "num_input_tokens_seen": 106673385, "step": 4945, "time_per_iteration": 2.577545166015625 }, { "auxiliary_loss_clip": 0.01146134, "auxiliary_loss_mlp": 0.0103382, "balance_loss_clip": 1.04975331, "balance_loss_mlp": 1.02671194, "epoch": 0.5947213371009439, "flos": 15231619388160.0, "grad_norm": 2.042271934224919, "language_loss": 0.85566735, "learning_rate": 1.4892624585850739e-06, "loss": 0.87746692, "num_input_tokens_seen": 106691740, "step": 4946, "time_per_iteration": 2.5260186195373535 }, { "auxiliary_loss_clip": 0.01173348, "auxiliary_loss_mlp": 0.010263, "balance_loss_clip": 1.05173409, "balance_loss_mlp": 1.01920664, "epoch": 0.594841579991583, "flos": 25848069949440.0, "grad_norm": 2.160239026251684, "language_loss": 0.79617047, "learning_rate": 1.4885093534932465e-06, "loss": 0.81816697, "num_input_tokens_seen": 106709705, "step": 4947, "time_per_iteration": 2.499295711517334 }, { "auxiliary_loss_clip": 0.01141825, "auxiliary_loss_mlp": 0.01032633, "balance_loss_clip": 1.05122542, "balance_loss_mlp": 1.02522981, "epoch": 0.5949618228822221, "flos": 23981109672960.0, "grad_norm": 1.9371950946161733, "language_loss": 0.71124041, "learning_rate": 1.4877563259941433e-06, "loss": 0.73298502, "num_input_tokens_seen": 106727560, "step": 4948, "time_per_iteration": 2.559952735900879 }, { "auxiliary_loss_clip": 0.01164304, "auxiliary_loss_mlp": 0.01026008, "balance_loss_clip": 1.05129421, "balance_loss_mlp": 1.01818752, "epoch": 0.5950820657728612, "flos": 40547491476480.0, "grad_norm": 2.198893873823859, "language_loss": 0.67517966, "learning_rate": 1.4870033762019988e-06, "loss": 0.69708276, "num_input_tokens_seen": 106747725, "step": 4949, "time_per_iteration": 3.4141390323638916 }, { "auxiliary_loss_clip": 0.01141284, "auxiliary_loss_mlp": 0.01029389, "balance_loss_clip": 1.04707515, "balance_loss_mlp": 1.02142, "epoch": 0.5952023086635003, "flos": 23184467884800.0, "grad_norm": 1.583804077719472, "language_loss": 0.7349515, "learning_rate": 1.4862505042310334e-06, "loss": 0.75665814, "num_input_tokens_seen": 106767010, "step": 4950, "time_per_iteration": 2.5252623558044434 }, { "auxiliary_loss_clip": 0.01137435, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.04847646, "balance_loss_mlp": 1.02300572, "epoch": 0.5953225515541394, "flos": 33653289548160.0, "grad_norm": 1.6381361841671558, "language_loss": 0.69407034, "learning_rate": 1.4854977101954587e-06, "loss": 0.71574718, "num_input_tokens_seen": 106789230, "step": 4951, "time_per_iteration": 2.635918140411377 }, { "auxiliary_loss_clip": 0.01157373, "auxiliary_loss_mlp": 0.01027982, "balance_loss_clip": 1.04728937, "balance_loss_mlp": 1.02013826, "epoch": 0.5954427944447784, "flos": 24459619680000.0, "grad_norm": 1.8205106592082994, "language_loss": 0.85983253, "learning_rate": 1.4847449942094716e-06, "loss": 0.88168603, "num_input_tokens_seen": 106808110, "step": 4952, "time_per_iteration": 2.5185632705688477 }, { "auxiliary_loss_clip": 0.0114235, "auxiliary_loss_mlp": 0.01026679, "balance_loss_clip": 1.04882193, "balance_loss_mlp": 1.01894832, "epoch": 0.5955630373354175, "flos": 18551848026240.0, "grad_norm": 3.048823204584803, "language_loss": 0.86300039, "learning_rate": 1.4839923563872598e-06, "loss": 0.88469064, "num_input_tokens_seen": 106826650, "step": 4953, "time_per_iteration": 4.145396947860718 }, { "auxiliary_loss_clip": 0.01126167, "auxiliary_loss_mlp": 0.01026338, "balance_loss_clip": 1.04654002, "balance_loss_mlp": 1.01912594, "epoch": 0.5956832802260567, "flos": 19791699730560.0, "grad_norm": 1.7019358353226637, "language_loss": 0.75969315, "learning_rate": 1.483239796842997e-06, "loss": 0.78121823, "num_input_tokens_seen": 106844680, "step": 4954, "time_per_iteration": 2.559460401535034 }, { "auxiliary_loss_clip": 0.01127558, "auxiliary_loss_mlp": 0.01025659, "balance_loss_clip": 1.04823875, "balance_loss_mlp": 1.01881075, "epoch": 0.5958035231166957, "flos": 19750868945280.0, "grad_norm": 1.6026827236600416, "language_loss": 0.83781356, "learning_rate": 1.4824873156908462e-06, "loss": 0.85934579, "num_input_tokens_seen": 106862605, "step": 4955, "time_per_iteration": 2.557255268096924 }, { "auxiliary_loss_clip": 0.01159569, "auxiliary_loss_mlp": 0.00761244, "balance_loss_clip": 1.05209899, "balance_loss_mlp": 1.00013292, "epoch": 0.5959237660073348, "flos": 21652806090240.0, "grad_norm": 1.4970469937253992, "language_loss": 0.75319445, "learning_rate": 1.4817349130449584e-06, "loss": 0.77240258, "num_input_tokens_seen": 106882325, "step": 4956, "time_per_iteration": 2.527223587036133 }, { "auxiliary_loss_clip": 0.01156108, "auxiliary_loss_mlp": 0.01029058, "balance_loss_clip": 1.05039203, "balance_loss_mlp": 1.02220654, "epoch": 0.5960440088979739, "flos": 21171207513600.0, "grad_norm": 3.9239504211231364, "language_loss": 0.82929188, "learning_rate": 1.4809825890194717e-06, "loss": 0.8511436, "num_input_tokens_seen": 106900995, "step": 4957, "time_per_iteration": 2.5119004249572754 }, { "auxiliary_loss_clip": 0.01136639, "auxiliary_loss_mlp": 0.01028765, "balance_loss_clip": 1.04599643, "balance_loss_mlp": 1.02211905, "epoch": 0.596164251788613, "flos": 14757526753920.0, "grad_norm": 1.7245731850533845, "language_loss": 0.77006006, "learning_rate": 1.4802303437285139e-06, "loss": 0.79171413, "num_input_tokens_seen": 106918265, "step": 4958, "time_per_iteration": 2.5098612308502197 }, { "auxiliary_loss_clip": 0.01143317, "auxiliary_loss_mlp": 0.01029356, "balance_loss_clip": 1.04729402, "balance_loss_mlp": 1.02117825, "epoch": 0.596284494679252, "flos": 20485924865280.0, "grad_norm": 2.3946136020528463, "language_loss": 0.8006283, "learning_rate": 1.4794781772861994e-06, "loss": 0.82235503, "num_input_tokens_seen": 106934760, "step": 4959, "time_per_iteration": 2.5287961959838867 }, { "auxiliary_loss_clip": 0.01142608, "auxiliary_loss_mlp": 0.00760745, "balance_loss_clip": 1.04820704, "balance_loss_mlp": 1.00014782, "epoch": 0.5964047375698912, "flos": 31212262108800.0, "grad_norm": 1.890362889090565, "language_loss": 0.66995776, "learning_rate": 1.4787260898066324e-06, "loss": 0.68899131, "num_input_tokens_seen": 106954760, "step": 4960, "time_per_iteration": 2.6273903846740723 }, { "auxiliary_loss_clip": 0.01167735, "auxiliary_loss_mlp": 0.01027823, "balance_loss_clip": 1.05070078, "balance_loss_mlp": 1.02033341, "epoch": 0.5965249804605303, "flos": 27483620855040.0, "grad_norm": 1.9634626220985516, "language_loss": 0.86040682, "learning_rate": 1.4779740814039023e-06, "loss": 0.88236243, "num_input_tokens_seen": 106974845, "step": 4961, "time_per_iteration": 2.498137950897217 }, { "auxiliary_loss_clip": 0.01172877, "auxiliary_loss_mlp": 0.0102209, "balance_loss_clip": 1.05240273, "balance_loss_mlp": 1.01456165, "epoch": 0.5966452233511693, "flos": 30773936442240.0, "grad_norm": 1.8253998573973973, "language_loss": 0.68060118, "learning_rate": 1.4772221521920894e-06, "loss": 0.70255083, "num_input_tokens_seen": 106994870, "step": 4962, "time_per_iteration": 2.5359127521514893 }, { "auxiliary_loss_clip": 0.01142013, "auxiliary_loss_mlp": 0.01029354, "balance_loss_clip": 1.04916692, "balance_loss_mlp": 1.02213025, "epoch": 0.5967654662418085, "flos": 25481170477440.0, "grad_norm": 2.4609260596742106, "language_loss": 0.7405647, "learning_rate": 1.4764703022852598e-06, "loss": 0.76227832, "num_input_tokens_seen": 107015390, "step": 4963, "time_per_iteration": 2.5565099716186523 }, { "auxiliary_loss_clip": 0.01090469, "auxiliary_loss_mlp": 0.01026487, "balance_loss_clip": 1.04414678, "balance_loss_mlp": 1.01960576, "epoch": 0.5968857091324475, "flos": 19099126621440.0, "grad_norm": 1.7404933431817045, "language_loss": 0.76792175, "learning_rate": 1.4757185317974696e-06, "loss": 0.78909129, "num_input_tokens_seen": 107033775, "step": 4964, "time_per_iteration": 2.603670835494995 }, { "auxiliary_loss_clip": 0.01155156, "auxiliary_loss_mlp": 0.01026795, "balance_loss_clip": 1.04808283, "balance_loss_mlp": 1.01965427, "epoch": 0.5970059520230866, "flos": 23692711374720.0, "grad_norm": 2.0228160139703224, "language_loss": 0.70779198, "learning_rate": 1.474966840842761e-06, "loss": 0.72961152, "num_input_tokens_seen": 107053355, "step": 4965, "time_per_iteration": 2.5000007152557373 }, { "auxiliary_loss_clip": 0.01159763, "auxiliary_loss_mlp": 0.01032595, "balance_loss_clip": 1.05043602, "balance_loss_mlp": 1.02515078, "epoch": 0.5971261949137258, "flos": 23185545292800.0, "grad_norm": 2.2901797523001735, "language_loss": 0.86947185, "learning_rate": 1.4742152295351655e-06, "loss": 0.89139545, "num_input_tokens_seen": 107072510, "step": 4966, "time_per_iteration": 2.5368471145629883 }, { "auxiliary_loss_clip": 0.01157867, "auxiliary_loss_mlp": 0.00761567, "balance_loss_clip": 1.05103004, "balance_loss_mlp": 1.00017929, "epoch": 0.5972464378043648, "flos": 20557710195840.0, "grad_norm": 3.2015555167182295, "language_loss": 0.64401215, "learning_rate": 1.4734636979887016e-06, "loss": 0.66320658, "num_input_tokens_seen": 107089970, "step": 4967, "time_per_iteration": 2.498842239379883 }, { "auxiliary_loss_clip": 0.01133442, "auxiliary_loss_mlp": 0.01032199, "balance_loss_clip": 1.04586828, "balance_loss_mlp": 1.02433431, "epoch": 0.5973666806950039, "flos": 29387030457600.0, "grad_norm": 2.058990998833509, "language_loss": 0.90192312, "learning_rate": 1.4727122463173755e-06, "loss": 0.92357957, "num_input_tokens_seen": 107108500, "step": 4968, "time_per_iteration": 2.5988411903381348 }, { "auxiliary_loss_clip": 0.01144814, "auxiliary_loss_mlp": 0.01026865, "balance_loss_clip": 1.05053282, "balance_loss_mlp": 1.0193069, "epoch": 0.597486923585643, "flos": 22273522041600.0, "grad_norm": 1.8513019254147054, "language_loss": 0.64295685, "learning_rate": 1.471960874635183e-06, "loss": 0.66467363, "num_input_tokens_seen": 107128060, "step": 4969, "time_per_iteration": 3.3159897327423096 }, { "auxiliary_loss_clip": 0.01139291, "auxiliary_loss_mlp": 0.01027003, "balance_loss_clip": 1.04659724, "balance_loss_mlp": 1.01949906, "epoch": 0.5976071664762821, "flos": 13772461196160.0, "grad_norm": 2.02812293871986, "language_loss": 0.70650601, "learning_rate": 1.4712095830561055e-06, "loss": 0.72816896, "num_input_tokens_seen": 107146550, "step": 4970, "time_per_iteration": 2.5073142051696777 }, { "auxiliary_loss_clip": 0.0114389, "auxiliary_loss_mlp": 0.01029279, "balance_loss_clip": 1.04644704, "balance_loss_mlp": 1.022138, "epoch": 0.5977274093669211, "flos": 19098623831040.0, "grad_norm": 2.5243872349622967, "language_loss": 0.81205362, "learning_rate": 1.4704583716941147e-06, "loss": 0.8337853, "num_input_tokens_seen": 107165415, "step": 4971, "time_per_iteration": 2.4951910972595215 }, { "auxiliary_loss_clip": 0.01150524, "auxiliary_loss_mlp": 0.0103683, "balance_loss_clip": 1.05081582, "balance_loss_mlp": 1.02932584, "epoch": 0.5978476522575603, "flos": 20376002269440.0, "grad_norm": 1.8454770498227142, "language_loss": 0.72436142, "learning_rate": 1.4697072406631672e-06, "loss": 0.74623495, "num_input_tokens_seen": 107185320, "step": 4972, "time_per_iteration": 2.505258798599243 }, { "auxiliary_loss_clip": 0.01120945, "auxiliary_loss_mlp": 0.01032547, "balance_loss_clip": 1.04874778, "balance_loss_mlp": 1.02506077, "epoch": 0.5979678951481994, "flos": 29023147728000.0, "grad_norm": 1.6355659600866272, "language_loss": 0.72527552, "learning_rate": 1.4689561900772097e-06, "loss": 0.74681044, "num_input_tokens_seen": 107205380, "step": 4973, "time_per_iteration": 2.650524616241455 }, { "auxiliary_loss_clip": 0.01143368, "auxiliary_loss_mlp": 0.01029942, "balance_loss_clip": 1.0474031, "balance_loss_mlp": 1.0225687, "epoch": 0.5980881380388384, "flos": 17967689141760.0, "grad_norm": 2.1245241270444715, "language_loss": 0.72083718, "learning_rate": 1.4682052200501758e-06, "loss": 0.74257028, "num_input_tokens_seen": 107222585, "step": 4974, "time_per_iteration": 2.5049898624420166 }, { "auxiliary_loss_clip": 0.0116929, "auxiliary_loss_mlp": 0.01028519, "balance_loss_clip": 1.05021095, "balance_loss_mlp": 1.02127731, "epoch": 0.5982083809294776, "flos": 22962827013120.0, "grad_norm": 1.7102658490633584, "language_loss": 0.79901171, "learning_rate": 1.4674543306959876e-06, "loss": 0.82098985, "num_input_tokens_seen": 107242055, "step": 4975, "time_per_iteration": 3.2376315593719482 }, { "auxiliary_loss_clip": 0.01148235, "auxiliary_loss_mlp": 0.01029737, "balance_loss_clip": 1.04861844, "balance_loss_mlp": 1.02203608, "epoch": 0.5983286238201166, "flos": 20991941712000.0, "grad_norm": 2.3119467174979382, "language_loss": 0.84080279, "learning_rate": 1.4667035221285535e-06, "loss": 0.86258256, "num_input_tokens_seen": 107259695, "step": 4976, "time_per_iteration": 2.516244888305664 }, { "auxiliary_loss_clip": 0.01153378, "auxiliary_loss_mlp": 0.01024775, "balance_loss_clip": 1.05044377, "balance_loss_mlp": 1.0174613, "epoch": 0.5984488667107557, "flos": 28183448511360.0, "grad_norm": 1.9461037709054763, "language_loss": 0.74233413, "learning_rate": 1.4659527944617715e-06, "loss": 0.76411569, "num_input_tokens_seen": 107279640, "step": 4977, "time_per_iteration": 2.5352916717529297 }, { "auxiliary_loss_clip": 0.01095291, "auxiliary_loss_mlp": 0.01030104, "balance_loss_clip": 1.04244733, "balance_loss_mlp": 1.02262056, "epoch": 0.5985691096013949, "flos": 16471794314880.0, "grad_norm": 1.6866192473436747, "language_loss": 0.76199692, "learning_rate": 1.465202147809526e-06, "loss": 0.78325087, "num_input_tokens_seen": 107298135, "step": 4978, "time_per_iteration": 3.4044220447540283 }, { "auxiliary_loss_clip": 0.0117308, "auxiliary_loss_mlp": 0.01027398, "balance_loss_clip": 1.05270672, "balance_loss_mlp": 1.02005494, "epoch": 0.5986893524920339, "flos": 26719046933760.0, "grad_norm": 1.919778392226231, "language_loss": 0.76026136, "learning_rate": 1.4644515822856888e-06, "loss": 0.78226614, "num_input_tokens_seen": 107316570, "step": 4979, "time_per_iteration": 3.2520570755004883 }, { "auxiliary_loss_clip": 0.01032236, "auxiliary_loss_mlp": 0.01003381, "balance_loss_clip": 1.01329887, "balance_loss_mlp": 1.0023793, "epoch": 0.598809595382673, "flos": 61608061100160.0, "grad_norm": 0.7592408506814904, "language_loss": 0.56544816, "learning_rate": 1.4637010980041215e-06, "loss": 0.58580428, "num_input_tokens_seen": 107378680, "step": 4980, "time_per_iteration": 3.1571218967437744 }, { "auxiliary_loss_clip": 0.01172562, "auxiliary_loss_mlp": 0.01025678, "balance_loss_clip": 1.05263638, "balance_loss_mlp": 1.01809669, "epoch": 0.5989298382733121, "flos": 11801719549440.0, "grad_norm": 2.063031892929405, "language_loss": 0.89316475, "learning_rate": 1.4629506950786707e-06, "loss": 0.91514719, "num_input_tokens_seen": 107394860, "step": 4981, "time_per_iteration": 2.464160442352295 }, { "auxiliary_loss_clip": 0.01063521, "auxiliary_loss_mlp": 0.01002464, "balance_loss_clip": 1.01471531, "balance_loss_mlp": 1.00145102, "epoch": 0.5990500811639512, "flos": 60025800021120.0, "grad_norm": 1.1930710231793182, "language_loss": 0.56083143, "learning_rate": 1.4622003736231733e-06, "loss": 0.58149129, "num_input_tokens_seen": 107453850, "step": 4982, "time_per_iteration": 3.131883144378662 }, { "auxiliary_loss_clip": 0.01152956, "auxiliary_loss_mlp": 0.01037829, "balance_loss_clip": 1.04883218, "balance_loss_mlp": 1.03052187, "epoch": 0.5991703240545903, "flos": 18222726683520.0, "grad_norm": 1.7603372419566368, "language_loss": 0.80887842, "learning_rate": 1.461450133751451e-06, "loss": 0.83078635, "num_input_tokens_seen": 107471920, "step": 4983, "time_per_iteration": 2.450629711151123 }, { "auxiliary_loss_clip": 0.01156172, "auxiliary_loss_mlp": 0.01028753, "balance_loss_clip": 1.04892278, "balance_loss_mlp": 1.02164268, "epoch": 0.5992905669452293, "flos": 27709894581120.0, "grad_norm": 1.7210128952502044, "language_loss": 0.75731546, "learning_rate": 1.4606999755773153e-06, "loss": 0.77916467, "num_input_tokens_seen": 107493125, "step": 4984, "time_per_iteration": 2.5277669429779053 }, { "auxiliary_loss_clip": 0.01172326, "auxiliary_loss_mlp": 0.01025991, "balance_loss_clip": 1.05404556, "balance_loss_mlp": 1.01892805, "epoch": 0.5994108098358685, "flos": 20449008662400.0, "grad_norm": 2.083681758306004, "language_loss": 0.81989276, "learning_rate": 1.4599498992145643e-06, "loss": 0.84187591, "num_input_tokens_seen": 107513150, "step": 4985, "time_per_iteration": 2.4487643241882324 }, { "auxiliary_loss_clip": 0.01149392, "auxiliary_loss_mlp": 0.00760888, "balance_loss_clip": 1.05087125, "balance_loss_mlp": 1.00016153, "epoch": 0.5995310527265075, "flos": 22269966595200.0, "grad_norm": 2.028073502215373, "language_loss": 0.70679224, "learning_rate": 1.4591999047769846e-06, "loss": 0.72589505, "num_input_tokens_seen": 107532005, "step": 4986, "time_per_iteration": 2.5288710594177246 }, { "auxiliary_loss_clip": 0.01097341, "auxiliary_loss_mlp": 0.01027345, "balance_loss_clip": 1.0415684, "balance_loss_mlp": 1.02018321, "epoch": 0.5996512956171466, "flos": 18916951818240.0, "grad_norm": 2.382431305698634, "language_loss": 0.75121605, "learning_rate": 1.4584499923783486e-06, "loss": 0.77246296, "num_input_tokens_seen": 107550585, "step": 4987, "time_per_iteration": 2.586296796798706 }, { "auxiliary_loss_clip": 0.01142265, "auxiliary_loss_mlp": 0.01029596, "balance_loss_clip": 1.04800951, "balance_loss_mlp": 1.02257431, "epoch": 0.5997715385077858, "flos": 15370916330880.0, "grad_norm": 1.7222629669012672, "language_loss": 0.76159251, "learning_rate": 1.457700162132419e-06, "loss": 0.78331107, "num_input_tokens_seen": 107567575, "step": 4988, "time_per_iteration": 2.5300979614257812 }, { "auxiliary_loss_clip": 0.01112994, "auxiliary_loss_mlp": 0.01032018, "balance_loss_clip": 1.04701173, "balance_loss_mlp": 1.02472198, "epoch": 0.5998917813984248, "flos": 25264844818560.0, "grad_norm": 1.9571896677795666, "language_loss": 0.72550428, "learning_rate": 1.4569504141529433e-06, "loss": 0.74695438, "num_input_tokens_seen": 107585410, "step": 4989, "time_per_iteration": 2.648343324661255 }, { "auxiliary_loss_clip": 0.01152732, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.04880786, "balance_loss_mlp": 1.02306938, "epoch": 0.6000120242890639, "flos": 22054502862720.0, "grad_norm": 2.2076063931375547, "language_loss": 0.71788228, "learning_rate": 1.456200748553658e-06, "loss": 0.73971117, "num_input_tokens_seen": 107603405, "step": 4990, "time_per_iteration": 2.5250959396362305 }, { "auxiliary_loss_clip": 0.01173983, "auxiliary_loss_mlp": 0.0103102, "balance_loss_clip": 1.05403042, "balance_loss_mlp": 1.02368569, "epoch": 0.600132267179703, "flos": 29863421562240.0, "grad_norm": 1.521462865616661, "language_loss": 0.78798807, "learning_rate": 1.455451165448287e-06, "loss": 0.81003809, "num_input_tokens_seen": 107626060, "step": 4991, "time_per_iteration": 2.5222954750061035 }, { "auxiliary_loss_clip": 0.01140372, "auxiliary_loss_mlp": 0.01028986, "balance_loss_clip": 1.04875445, "balance_loss_mlp": 1.02141857, "epoch": 0.6002525100703421, "flos": 25045358762880.0, "grad_norm": 2.2631740119076538, "language_loss": 0.73336339, "learning_rate": 1.4547016649505407e-06, "loss": 0.75505698, "num_input_tokens_seen": 107644070, "step": 4992, "time_per_iteration": 2.541682720184326 }, { "auxiliary_loss_clip": 0.0112388, "auxiliary_loss_mlp": 0.01029327, "balance_loss_clip": 1.04208529, "balance_loss_mlp": 1.02195406, "epoch": 0.6003727529609811, "flos": 20849592113280.0, "grad_norm": 2.330119454792163, "language_loss": 0.85161555, "learning_rate": 1.4539522471741193e-06, "loss": 0.87314761, "num_input_tokens_seen": 107661495, "step": 4993, "time_per_iteration": 2.5848255157470703 }, { "auxiliary_loss_clip": 0.01160248, "auxiliary_loss_mlp": 0.01031748, "balance_loss_clip": 1.04946947, "balance_loss_mlp": 1.02332568, "epoch": 0.6004929958516203, "flos": 15594604277760.0, "grad_norm": 2.1527565979297476, "language_loss": 0.70737916, "learning_rate": 1.4532029122327067e-06, "loss": 0.72929907, "num_input_tokens_seen": 107678280, "step": 4994, "time_per_iteration": 2.4901325702667236 }, { "auxiliary_loss_clip": 0.01124907, "auxiliary_loss_mlp": 0.0102944, "balance_loss_clip": 1.04954314, "balance_loss_mlp": 1.02223945, "epoch": 0.6006132387422594, "flos": 21763267390080.0, "grad_norm": 1.8854270442171732, "language_loss": 0.75616729, "learning_rate": 1.4524536602399783e-06, "loss": 0.7777108, "num_input_tokens_seen": 107697370, "step": 4995, "time_per_iteration": 3.395029067993164 }, { "auxiliary_loss_clip": 0.01140542, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.05063426, "balance_loss_mlp": 1.0244863, "epoch": 0.6007334816328984, "flos": 22858542852480.0, "grad_norm": 1.5318933953903326, "language_loss": 0.77445042, "learning_rate": 1.4517044913095938e-06, "loss": 0.79617155, "num_input_tokens_seen": 107717790, "step": 4996, "time_per_iteration": 2.5938239097595215 }, { "auxiliary_loss_clip": 0.01160631, "auxiliary_loss_mlp": 0.01027004, "balance_loss_clip": 1.05207086, "balance_loss_mlp": 1.01940131, "epoch": 0.6008537245235376, "flos": 28324577047680.0, "grad_norm": 1.8716186995571915, "language_loss": 0.81571144, "learning_rate": 1.4509554055552022e-06, "loss": 0.83758777, "num_input_tokens_seen": 107738020, "step": 4997, "time_per_iteration": 2.545639991760254 }, { "auxiliary_loss_clip": 0.01140713, "auxiliary_loss_mlp": 0.01030419, "balance_loss_clip": 1.04765427, "balance_loss_mlp": 1.02289057, "epoch": 0.6009739674141766, "flos": 20886113266560.0, "grad_norm": 2.724066863539614, "language_loss": 0.84136361, "learning_rate": 1.450206403090439e-06, "loss": 0.86307502, "num_input_tokens_seen": 107756215, "step": 4998, "time_per_iteration": 2.5296237468719482 }, { "auxiliary_loss_clip": 0.01155399, "auxiliary_loss_mlp": 0.01029711, "balance_loss_clip": 1.05153394, "balance_loss_mlp": 1.02196836, "epoch": 0.6010942103048157, "flos": 20481004702080.0, "grad_norm": 2.2014417986606305, "language_loss": 0.86197782, "learning_rate": 1.4494574840289274e-06, "loss": 0.883829, "num_input_tokens_seen": 107773330, "step": 4999, "time_per_iteration": 2.4913082122802734 }, { "auxiliary_loss_clip": 0.01161842, "auxiliary_loss_mlp": 0.01028612, "balance_loss_clip": 1.04896176, "balance_loss_mlp": 1.02065229, "epoch": 0.6012144531954549, "flos": 23805973935360.0, "grad_norm": 1.626210647302647, "language_loss": 0.73897898, "learning_rate": 1.4487086484842782e-06, "loss": 0.76088357, "num_input_tokens_seen": 107791975, "step": 5000, "time_per_iteration": 3.3269667625427246 }, { "auxiliary_loss_clip": 0.01172894, "auxiliary_loss_mlp": 0.01025834, "balance_loss_clip": 1.05255353, "balance_loss_mlp": 1.01861334, "epoch": 0.6013346960860939, "flos": 18988378012800.0, "grad_norm": 2.074818950457689, "language_loss": 0.60172033, "learning_rate": 1.4479598965700878e-06, "loss": 0.62370765, "num_input_tokens_seen": 107809240, "step": 5001, "time_per_iteration": 2.4390101432800293 }, { "auxiliary_loss_clip": 0.0112751, "auxiliary_loss_mlp": 0.01032741, "balance_loss_clip": 1.04602087, "balance_loss_mlp": 1.02540064, "epoch": 0.601454938976733, "flos": 24025316336640.0, "grad_norm": 2.260677376856249, "language_loss": 0.690153, "learning_rate": 1.4472112283999427e-06, "loss": 0.71175551, "num_input_tokens_seen": 107827895, "step": 5002, "time_per_iteration": 2.572288990020752 }, { "auxiliary_loss_clip": 0.0115176, "auxiliary_loss_mlp": 0.01030213, "balance_loss_clip": 1.04986107, "balance_loss_mlp": 1.0231235, "epoch": 0.6015751818673721, "flos": 26427129102720.0, "grad_norm": 1.8766889377158704, "language_loss": 0.69073755, "learning_rate": 1.4464626440874143e-06, "loss": 0.71255732, "num_input_tokens_seen": 107847010, "step": 5003, "time_per_iteration": 3.376095771789551 }, { "auxiliary_loss_clip": 0.01118402, "auxiliary_loss_mlp": 0.01026422, "balance_loss_clip": 1.04218197, "balance_loss_mlp": 1.01850653, "epoch": 0.6016954247580112, "flos": 13115260005120.0, "grad_norm": 2.592184211996956, "language_loss": 0.74434882, "learning_rate": 1.4457141437460636e-06, "loss": 0.76579702, "num_input_tokens_seen": 107864235, "step": 5004, "time_per_iteration": 2.562472105026245 }, { "auxiliary_loss_clip": 0.0114437, "auxiliary_loss_mlp": 0.01028972, "balance_loss_clip": 1.049739, "balance_loss_mlp": 1.02103555, "epoch": 0.6018156676486502, "flos": 23768447201280.0, "grad_norm": 1.6987704466959332, "language_loss": 0.73268712, "learning_rate": 1.444965727489436e-06, "loss": 0.75442052, "num_input_tokens_seen": 107883680, "step": 5005, "time_per_iteration": 3.356118679046631 }, { "auxiliary_loss_clip": 0.0112791, "auxiliary_loss_mlp": 0.0103137, "balance_loss_clip": 1.04429018, "balance_loss_mlp": 1.02459025, "epoch": 0.6019359105392894, "flos": 26469360518400.0, "grad_norm": 2.342467979886596, "language_loss": 0.63484031, "learning_rate": 1.444217395431066e-06, "loss": 0.65643311, "num_input_tokens_seen": 107906220, "step": 5006, "time_per_iteration": 2.6083390712738037 }, { "auxiliary_loss_clip": 0.01031577, "auxiliary_loss_mlp": 0.01001203, "balance_loss_clip": 1.01548362, "balance_loss_mlp": 1.00007617, "epoch": 0.6020561534299285, "flos": 69190849728000.0, "grad_norm": 0.8050056976344008, "language_loss": 0.55850816, "learning_rate": 1.4434691476844755e-06, "loss": 0.57883596, "num_input_tokens_seen": 107967195, "step": 5007, "time_per_iteration": 3.128286838531494 }, { "auxiliary_loss_clip": 0.01141738, "auxiliary_loss_mlp": 0.0102828, "balance_loss_clip": 1.04991913, "balance_loss_mlp": 1.02120495, "epoch": 0.6021763963205675, "flos": 21835304115840.0, "grad_norm": 2.2594764321591883, "language_loss": 0.66796839, "learning_rate": 1.4427209843631729e-06, "loss": 0.6896686, "num_input_tokens_seen": 107984245, "step": 5008, "time_per_iteration": 2.5236167907714844 }, { "auxiliary_loss_clip": 0.0116872, "auxiliary_loss_mlp": 0.00760807, "balance_loss_clip": 1.05076206, "balance_loss_mlp": 1.00015819, "epoch": 0.6022966392112067, "flos": 26578636669440.0, "grad_norm": 2.000628927777027, "language_loss": 0.80750167, "learning_rate": 1.4419729055806534e-06, "loss": 0.82679695, "num_input_tokens_seen": 108003680, "step": 5009, "time_per_iteration": 2.5175673961639404 }, { "auxiliary_loss_clip": 0.01141703, "auxiliary_loss_mlp": 0.00761072, "balance_loss_clip": 1.05177784, "balance_loss_mlp": 1.00015187, "epoch": 0.6024168821018457, "flos": 20703722981760.0, "grad_norm": 1.7071341725266582, "language_loss": 0.81984186, "learning_rate": 1.441224911450401e-06, "loss": 0.83886969, "num_input_tokens_seen": 108019635, "step": 5010, "time_per_iteration": 2.5113325119018555 }, { "auxiliary_loss_clip": 0.01162352, "auxiliary_loss_mlp": 0.01029791, "balance_loss_clip": 1.05160356, "balance_loss_mlp": 1.0223465, "epoch": 0.6025371249924848, "flos": 24680973242880.0, "grad_norm": 1.6305472870591722, "language_loss": 0.82173908, "learning_rate": 1.4404770020858851e-06, "loss": 0.84366059, "num_input_tokens_seen": 108039120, "step": 5011, "time_per_iteration": 2.527385711669922 }, { "auxiliary_loss_clip": 0.01150365, "auxiliary_loss_mlp": 0.01023982, "balance_loss_clip": 1.04833531, "balance_loss_mlp": 1.01695442, "epoch": 0.602657367883124, "flos": 25955801815680.0, "grad_norm": 1.7352037619373997, "language_loss": 0.85877842, "learning_rate": 1.439729177600563e-06, "loss": 0.88052189, "num_input_tokens_seen": 108059615, "step": 5012, "time_per_iteration": 2.5252771377563477 }, { "auxiliary_loss_clip": 0.01158304, "auxiliary_loss_mlp": 0.01027159, "balance_loss_clip": 1.05130601, "balance_loss_mlp": 1.01963401, "epoch": 0.602777610773763, "flos": 16690633925760.0, "grad_norm": 1.9211105861191367, "language_loss": 0.72947651, "learning_rate": 1.4389814381078793e-06, "loss": 0.75133115, "num_input_tokens_seen": 108078855, "step": 5013, "time_per_iteration": 2.5023303031921387 }, { "auxiliary_loss_clip": 0.0106547, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.04346442, "balance_loss_mlp": 1.0284673, "epoch": 0.6028978536644021, "flos": 13334243270400.0, "grad_norm": 1.913315598122271, "language_loss": 0.80133998, "learning_rate": 1.438233783721265e-06, "loss": 0.82235539, "num_input_tokens_seen": 108095020, "step": 5014, "time_per_iteration": 2.9828877449035645 }, { "auxiliary_loss_clip": 0.01139291, "auxiliary_loss_mlp": 0.01032058, "balance_loss_clip": 1.05010903, "balance_loss_mlp": 1.02455056, "epoch": 0.6030180965550412, "flos": 19644825018240.0, "grad_norm": 1.8961866703507466, "language_loss": 0.78032362, "learning_rate": 1.43748621455414e-06, "loss": 0.80203712, "num_input_tokens_seen": 108111455, "step": 5015, "time_per_iteration": 2.6738622188568115 }, { "auxiliary_loss_clip": 0.01140652, "auxiliary_loss_mlp": 0.01027358, "balance_loss_clip": 1.04843664, "balance_loss_mlp": 1.01984787, "epoch": 0.6031383394456803, "flos": 14458390289280.0, "grad_norm": 2.362249084316124, "language_loss": 0.80117351, "learning_rate": 1.4367387307199082e-06, "loss": 0.82285357, "num_input_tokens_seen": 108128305, "step": 5016, "time_per_iteration": 2.5071182250976562 }, { "auxiliary_loss_clip": 0.01149982, "auxiliary_loss_mlp": 0.01032171, "balance_loss_clip": 1.04597187, "balance_loss_mlp": 1.02495289, "epoch": 0.6032585823363193, "flos": 13917791623680.0, "grad_norm": 3.1658959208693584, "language_loss": 0.82427037, "learning_rate": 1.4359913323319632e-06, "loss": 0.84609193, "num_input_tokens_seen": 108145475, "step": 5017, "time_per_iteration": 2.473132848739624 }, { "auxiliary_loss_clip": 0.01091245, "auxiliary_loss_mlp": 0.01029435, "balance_loss_clip": 1.04056907, "balance_loss_mlp": 1.02199078, "epoch": 0.6033788252269584, "flos": 24353252530560.0, "grad_norm": 1.9468612562311294, "language_loss": 0.77517068, "learning_rate": 1.4352440195036847e-06, "loss": 0.79637748, "num_input_tokens_seen": 108165650, "step": 5018, "time_per_iteration": 2.657898187637329 }, { "auxiliary_loss_clip": 0.01088682, "auxiliary_loss_mlp": 0.01025708, "balance_loss_clip": 1.03953481, "balance_loss_mlp": 1.01827502, "epoch": 0.6034990681175976, "flos": 25521247077120.0, "grad_norm": 1.553084400981856, "language_loss": 0.79962242, "learning_rate": 1.4344967923484395e-06, "loss": 0.82076627, "num_input_tokens_seen": 108187620, "step": 5019, "time_per_iteration": 2.7157187461853027 }, { "auxiliary_loss_clip": 0.01154169, "auxiliary_loss_mlp": 0.0102469, "balance_loss_clip": 1.04833627, "balance_loss_mlp": 1.01684618, "epoch": 0.6036193110082366, "flos": 25958387594880.0, "grad_norm": 2.11648332857497, "language_loss": 0.71973741, "learning_rate": 1.433749650979581e-06, "loss": 0.74152601, "num_input_tokens_seen": 108207605, "step": 5020, "time_per_iteration": 3.544306993484497 }, { "auxiliary_loss_clip": 0.01130646, "auxiliary_loss_mlp": 0.01026528, "balance_loss_clip": 1.04597783, "balance_loss_mlp": 1.01919675, "epoch": 0.6037395538988757, "flos": 25593427457280.0, "grad_norm": 1.884362125407266, "language_loss": 0.68122274, "learning_rate": 1.433002595510451e-06, "loss": 0.70279443, "num_input_tokens_seen": 108226385, "step": 5021, "time_per_iteration": 2.6176869869232178 }, { "auxiliary_loss_clip": 0.01139528, "auxiliary_loss_mlp": 0.00761298, "balance_loss_clip": 1.04784632, "balance_loss_mlp": 1.00017428, "epoch": 0.6038597967895148, "flos": 17816253402240.0, "grad_norm": 1.8691836255799785, "language_loss": 0.72049999, "learning_rate": 1.4322556260543757e-06, "loss": 0.73950827, "num_input_tokens_seen": 108242960, "step": 5022, "time_per_iteration": 2.5375945568084717 }, { "auxiliary_loss_clip": 0.0103335, "auxiliary_loss_mlp": 0.01002663, "balance_loss_clip": 1.01370335, "balance_loss_mlp": 1.00154209, "epoch": 0.6039800396801539, "flos": 65169213235200.0, "grad_norm": 0.8912829236257772, "language_loss": 0.62697852, "learning_rate": 1.4315087427246703e-06, "loss": 0.64733863, "num_input_tokens_seen": 108296785, "step": 5023, "time_per_iteration": 3.0460522174835205 }, { "auxiliary_loss_clip": 0.01063874, "auxiliary_loss_mlp": 0.01000174, "balance_loss_clip": 1.01485348, "balance_loss_mlp": 0.9990595, "epoch": 0.604100282570793, "flos": 67386409073280.0, "grad_norm": 0.8752576712014443, "language_loss": 0.58510673, "learning_rate": 1.4307619456346372e-06, "loss": 0.6057471, "num_input_tokens_seen": 108341090, "step": 5024, "time_per_iteration": 2.781691312789917 }, { "auxiliary_loss_clip": 0.0115599, "auxiliary_loss_mlp": 0.01031215, "balance_loss_clip": 1.04696679, "balance_loss_mlp": 1.02350175, "epoch": 0.6042205254614321, "flos": 35297495631360.0, "grad_norm": 1.8392660669120915, "language_loss": 0.74428624, "learning_rate": 1.430015234897564e-06, "loss": 0.76615828, "num_input_tokens_seen": 108364370, "step": 5025, "time_per_iteration": 2.6645851135253906 }, { "auxiliary_loss_clip": 0.01168098, "auxiliary_loss_mlp": 0.00760665, "balance_loss_clip": 1.04810953, "balance_loss_mlp": 1.00020099, "epoch": 0.6043407683520712, "flos": 45658262206080.0, "grad_norm": 1.9419058352399683, "language_loss": 0.66698223, "learning_rate": 1.4292686106267274e-06, "loss": 0.68626982, "num_input_tokens_seen": 108387220, "step": 5026, "time_per_iteration": 2.6779115200042725 }, { "auxiliary_loss_clip": 0.01161443, "auxiliary_loss_mlp": 0.01029924, "balance_loss_clip": 1.05058587, "balance_loss_mlp": 1.0225209, "epoch": 0.6044610112427102, "flos": 16180020138240.0, "grad_norm": 1.7201349394366874, "language_loss": 0.76950133, "learning_rate": 1.4285220729353876e-06, "loss": 0.79141498, "num_input_tokens_seen": 108405760, "step": 5027, "time_per_iteration": 3.263826370239258 }, { "auxiliary_loss_clip": 0.01142944, "auxiliary_loss_mlp": 0.01028721, "balance_loss_clip": 1.04552758, "balance_loss_mlp": 1.0215416, "epoch": 0.6045812541333494, "flos": 13804062186240.0, "grad_norm": 1.9173193395582409, "language_loss": 0.77169096, "learning_rate": 1.4277756219367957e-06, "loss": 0.79340756, "num_input_tokens_seen": 108422785, "step": 5028, "time_per_iteration": 2.5185365676879883 }, { "auxiliary_loss_clip": 0.01134698, "auxiliary_loss_mlp": 0.01028397, "balance_loss_clip": 1.04797542, "balance_loss_mlp": 1.02133954, "epoch": 0.6047014970239885, "flos": 19975059682560.0, "grad_norm": 2.373948950005817, "language_loss": 0.79629922, "learning_rate": 1.4270292577441864e-06, "loss": 0.8179301, "num_input_tokens_seen": 108442290, "step": 5029, "time_per_iteration": 3.6376543045043945 }, { "auxiliary_loss_clip": 0.0115865, "auxiliary_loss_mlp": 0.01026968, "balance_loss_clip": 1.04741406, "balance_loss_mlp": 1.01924968, "epoch": 0.6048217399146275, "flos": 25337097025920.0, "grad_norm": 1.734029427462117, "language_loss": 0.71954924, "learning_rate": 1.4262829804707836e-06, "loss": 0.74140537, "num_input_tokens_seen": 108464280, "step": 5030, "time_per_iteration": 2.5502326488494873 }, { "auxiliary_loss_clip": 0.01156592, "auxiliary_loss_mlp": 0.01031136, "balance_loss_clip": 1.04770315, "balance_loss_mlp": 1.02317882, "epoch": 0.6049419828052667, "flos": 26030819370240.0, "grad_norm": 1.4065751727777187, "language_loss": 0.69818598, "learning_rate": 1.4255367902297958e-06, "loss": 0.72006327, "num_input_tokens_seen": 108485610, "step": 5031, "time_per_iteration": 3.2819416522979736 }, { "auxiliary_loss_clip": 0.01169679, "auxiliary_loss_mlp": 0.01026393, "balance_loss_clip": 1.05160785, "balance_loss_mlp": 1.01915073, "epoch": 0.6050622256959057, "flos": 14648106948480.0, "grad_norm": 2.0407381611507884, "language_loss": 0.78800255, "learning_rate": 1.4247906871344215e-06, "loss": 0.80996323, "num_input_tokens_seen": 108501005, "step": 5032, "time_per_iteration": 2.4371049404144287 }, { "auxiliary_loss_clip": 0.01137004, "auxiliary_loss_mlp": 0.01027924, "balance_loss_clip": 1.0445894, "balance_loss_mlp": 1.02084279, "epoch": 0.6051824685865448, "flos": 23331450337920.0, "grad_norm": 2.0528535245311135, "language_loss": 0.74954277, "learning_rate": 1.4240446712978415e-06, "loss": 0.77119207, "num_input_tokens_seen": 108519990, "step": 5033, "time_per_iteration": 2.5370421409606934 }, { "auxiliary_loss_clip": 0.01160855, "auxiliary_loss_mlp": 0.01028043, "balance_loss_clip": 1.04959178, "balance_loss_mlp": 1.02087569, "epoch": 0.605302711477184, "flos": 27563307177600.0, "grad_norm": 1.8582921908899248, "language_loss": 0.74431491, "learning_rate": 1.423298742833227e-06, "loss": 0.76620388, "num_input_tokens_seen": 108538650, "step": 5034, "time_per_iteration": 2.554076671600342 }, { "auxiliary_loss_clip": 0.01133233, "auxiliary_loss_mlp": 0.01026169, "balance_loss_clip": 1.04476261, "balance_loss_mlp": 1.01834321, "epoch": 0.605422954367823, "flos": 15154698412800.0, "grad_norm": 1.9074758464865238, "language_loss": 0.71623856, "learning_rate": 1.4225529018537352e-06, "loss": 0.73783255, "num_input_tokens_seen": 108554155, "step": 5035, "time_per_iteration": 2.5255210399627686 }, { "auxiliary_loss_clip": 0.01170124, "auxiliary_loss_mlp": 0.01029439, "balance_loss_clip": 1.05102158, "balance_loss_mlp": 1.02208924, "epoch": 0.6055431972584621, "flos": 27673912131840.0, "grad_norm": 1.5682219295038988, "language_loss": 0.7759645, "learning_rate": 1.4218071484725082e-06, "loss": 0.7979601, "num_input_tokens_seen": 108576275, "step": 5036, "time_per_iteration": 2.5155751705169678 }, { "auxiliary_loss_clip": 0.01140313, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.04978085, "balance_loss_mlp": 1.02194381, "epoch": 0.6056634401491012, "flos": 19387489006080.0, "grad_norm": 1.8116777868601297, "language_loss": 0.76052415, "learning_rate": 1.4210614828026786e-06, "loss": 0.78222573, "num_input_tokens_seen": 108594125, "step": 5037, "time_per_iteration": 2.5189146995544434 }, { "auxiliary_loss_clip": 0.01168178, "auxiliary_loss_mlp": 0.01027302, "balance_loss_clip": 1.04875851, "balance_loss_mlp": 1.01997662, "epoch": 0.6057836830397403, "flos": 24789459294720.0, "grad_norm": 1.4431975136755333, "language_loss": 0.74256539, "learning_rate": 1.4203159049573605e-06, "loss": 0.76452017, "num_input_tokens_seen": 108615360, "step": 5038, "time_per_iteration": 2.492875337600708 }, { "auxiliary_loss_clip": 0.01144783, "auxiliary_loss_mlp": 0.01032171, "balance_loss_clip": 1.04657054, "balance_loss_mlp": 1.02422619, "epoch": 0.6059039259303793, "flos": 20558248899840.0, "grad_norm": 2.1016629590022644, "language_loss": 0.86850607, "learning_rate": 1.4195704150496593e-06, "loss": 0.89027572, "num_input_tokens_seen": 108633075, "step": 5039, "time_per_iteration": 2.5013492107391357 }, { "auxiliary_loss_clip": 0.01141825, "auxiliary_loss_mlp": 0.01031541, "balance_loss_clip": 1.04902124, "balance_loss_mlp": 1.02423906, "epoch": 0.6060241688210185, "flos": 21069724613760.0, "grad_norm": 2.3025083398920865, "language_loss": 0.74200451, "learning_rate": 1.4188250131926639e-06, "loss": 0.76373816, "num_input_tokens_seen": 108651875, "step": 5040, "time_per_iteration": 2.5423552989959717 }, { "auxiliary_loss_clip": 0.01142352, "auxiliary_loss_mlp": 0.01028038, "balance_loss_clip": 1.04623497, "balance_loss_mlp": 1.0202837, "epoch": 0.6061444117116576, "flos": 16361081619840.0, "grad_norm": 1.787963858499802, "language_loss": 0.80430949, "learning_rate": 1.4180796994994525e-06, "loss": 0.82601345, "num_input_tokens_seen": 108669290, "step": 5041, "time_per_iteration": 2.4954183101654053 }, { "auxiliary_loss_clip": 0.01141024, "auxiliary_loss_mlp": 0.01024772, "balance_loss_clip": 1.04632294, "balance_loss_mlp": 1.01767302, "epoch": 0.6062646546022966, "flos": 21507296094720.0, "grad_norm": 1.928442226117961, "language_loss": 0.71961474, "learning_rate": 1.4173344740830877e-06, "loss": 0.74127269, "num_input_tokens_seen": 108688420, "step": 5042, "time_per_iteration": 2.5367302894592285 }, { "auxiliary_loss_clip": 0.01133586, "auxiliary_loss_mlp": 0.01032335, "balance_loss_clip": 1.0463922, "balance_loss_mlp": 1.02470541, "epoch": 0.6063848974929358, "flos": 38983151283840.0, "grad_norm": 1.626895413962969, "language_loss": 0.70770895, "learning_rate": 1.4165893370566206e-06, "loss": 0.72936809, "num_input_tokens_seen": 108712175, "step": 5043, "time_per_iteration": 2.680866003036499 }, { "auxiliary_loss_clip": 0.0115119, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.0475806, "balance_loss_mlp": 1.0240705, "epoch": 0.6065051403835748, "flos": 19646584784640.0, "grad_norm": 1.671285411578337, "language_loss": 0.77359647, "learning_rate": 1.4158442885330865e-06, "loss": 0.79542571, "num_input_tokens_seen": 108730745, "step": 5044, "time_per_iteration": 2.517709255218506 }, { "auxiliary_loss_clip": 0.01148676, "auxiliary_loss_mlp": 0.01027954, "balance_loss_clip": 1.046296, "balance_loss_mlp": 1.02058697, "epoch": 0.6066253832742139, "flos": 23513086437120.0, "grad_norm": 2.2595020639729397, "language_loss": 0.78854567, "learning_rate": 1.4150993286255094e-06, "loss": 0.81031191, "num_input_tokens_seen": 108749995, "step": 5045, "time_per_iteration": 2.5145297050476074 }, { "auxiliary_loss_clip": 0.01168425, "auxiliary_loss_mlp": 0.01025041, "balance_loss_clip": 1.04864192, "balance_loss_mlp": 1.01819873, "epoch": 0.6067456261648531, "flos": 19133708440320.0, "grad_norm": 2.3081722415663832, "language_loss": 0.7969625, "learning_rate": 1.4143544574468993e-06, "loss": 0.81889725, "num_input_tokens_seen": 108768355, "step": 5046, "time_per_iteration": 3.2486488819122314 }, { "auxiliary_loss_clip": 0.01153536, "auxiliary_loss_mlp": 0.01023674, "balance_loss_clip": 1.04908299, "balance_loss_mlp": 1.01647675, "epoch": 0.6068658690554921, "flos": 20520614424960.0, "grad_norm": 1.7980539147425736, "language_loss": 0.82365543, "learning_rate": 1.4136096751102523e-06, "loss": 0.84542757, "num_input_tokens_seen": 108786685, "step": 5047, "time_per_iteration": 2.4696905612945557 }, { "auxiliary_loss_clip": 0.01145681, "auxiliary_loss_mlp": 0.01033755, "balance_loss_clip": 1.04978609, "balance_loss_mlp": 1.02615893, "epoch": 0.6069861119461312, "flos": 27374560185600.0, "grad_norm": 2.2578256745616905, "language_loss": 0.83113116, "learning_rate": 1.4128649817285516e-06, "loss": 0.85292554, "num_input_tokens_seen": 108804820, "step": 5048, "time_per_iteration": 2.5480709075927734 }, { "auxiliary_loss_clip": 0.01144217, "auxiliary_loss_mlp": 0.01037267, "balance_loss_clip": 1.0462687, "balance_loss_mlp": 1.02821898, "epoch": 0.6071063548367702, "flos": 25626500904960.0, "grad_norm": 2.067967429811361, "language_loss": 0.63019472, "learning_rate": 1.412120377414766e-06, "loss": 0.65200955, "num_input_tokens_seen": 108825010, "step": 5049, "time_per_iteration": 2.569143295288086 }, { "auxiliary_loss_clip": 0.01169189, "auxiliary_loss_mlp": 0.01028863, "balance_loss_clip": 1.05025578, "balance_loss_mlp": 1.02163839, "epoch": 0.6072265977274094, "flos": 24460517520000.0, "grad_norm": 1.5601169010285756, "language_loss": 0.71248406, "learning_rate": 1.4113758622818522e-06, "loss": 0.73446453, "num_input_tokens_seen": 108845075, "step": 5050, "time_per_iteration": 2.473106622695923 }, { "auxiliary_loss_clip": 0.01147405, "auxiliary_loss_mlp": 0.00760056, "balance_loss_clip": 1.05008459, "balance_loss_mlp": 1.00016809, "epoch": 0.6073468406180484, "flos": 18149253413760.0, "grad_norm": 2.0102643311803594, "language_loss": 0.83410805, "learning_rate": 1.410631436442751e-06, "loss": 0.85318267, "num_input_tokens_seen": 108863870, "step": 5051, "time_per_iteration": 2.5103046894073486 }, { "auxiliary_loss_clip": 0.01162416, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 1.0504868, "balance_loss_mlp": 1.02297306, "epoch": 0.6074670835086875, "flos": 20697617669760.0, "grad_norm": 2.1185557217760107, "language_loss": 0.8626864, "learning_rate": 1.4098871000103936e-06, "loss": 0.88461822, "num_input_tokens_seen": 108882470, "step": 5052, "time_per_iteration": 3.259561061859131 }, { "auxiliary_loss_clip": 0.01141407, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.04696882, "balance_loss_mlp": 1.02248442, "epoch": 0.6075873263993267, "flos": 23769955572480.0, "grad_norm": 1.7385177174217754, "language_loss": 0.8265267, "learning_rate": 1.409142853097693e-06, "loss": 0.84823829, "num_input_tokens_seen": 108902710, "step": 5053, "time_per_iteration": 2.521294116973877 }, { "auxiliary_loss_clip": 0.01145355, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.04974174, "balance_loss_mlp": 1.02527738, "epoch": 0.6077075692899657, "flos": 24454484035200.0, "grad_norm": 1.8619629891851506, "language_loss": 0.7926302, "learning_rate": 1.408398695817553e-06, "loss": 0.81441194, "num_input_tokens_seen": 108919935, "step": 5054, "time_per_iteration": 2.537020206451416 }, { "auxiliary_loss_clip": 0.01142889, "auxiliary_loss_mlp": 0.01028864, "balance_loss_clip": 1.04665685, "balance_loss_mlp": 1.02068591, "epoch": 0.6078278121806048, "flos": 27382102041600.0, "grad_norm": 1.746642085765271, "language_loss": 0.69873393, "learning_rate": 1.4076546282828593e-06, "loss": 0.72045147, "num_input_tokens_seen": 108942790, "step": 5055, "time_per_iteration": 2.572387933731079 }, { "auxiliary_loss_clip": 0.01144373, "auxiliary_loss_mlp": 0.01024083, "balance_loss_clip": 1.0441277, "balance_loss_mlp": 1.01620579, "epoch": 0.6079480550712439, "flos": 38436447306240.0, "grad_norm": 2.2600734127843074, "language_loss": 0.66149127, "learning_rate": 1.4069106506064874e-06, "loss": 0.6831758, "num_input_tokens_seen": 108964215, "step": 5056, "time_per_iteration": 4.191065073013306 }, { "auxiliary_loss_clip": 0.01138103, "auxiliary_loss_mlp": 0.01023523, "balance_loss_clip": 1.04938555, "balance_loss_mlp": 1.01637018, "epoch": 0.608068297961883, "flos": 25336271013120.0, "grad_norm": 1.7104498192279372, "language_loss": 0.78463447, "learning_rate": 1.4061667629012989e-06, "loss": 0.80625081, "num_input_tokens_seen": 108984885, "step": 5057, "time_per_iteration": 2.570335865020752 }, { "auxiliary_loss_clip": 0.01134633, "auxiliary_loss_mlp": 0.01028496, "balance_loss_clip": 1.04867816, "balance_loss_mlp": 1.02150989, "epoch": 0.608188540852522, "flos": 24202463235840.0, "grad_norm": 2.2961106713848607, "language_loss": 0.83135146, "learning_rate": 1.40542296528014e-06, "loss": 0.85298276, "num_input_tokens_seen": 109004545, "step": 5058, "time_per_iteration": 2.5524842739105225 }, { "auxiliary_loss_clip": 0.01155536, "auxiliary_loss_mlp": 0.01030942, "balance_loss_clip": 1.04722738, "balance_loss_mlp": 1.02319336, "epoch": 0.6083087837431612, "flos": 21284146851840.0, "grad_norm": 1.9347432085659586, "language_loss": 0.76178789, "learning_rate": 1.4046792578558452e-06, "loss": 0.78365266, "num_input_tokens_seen": 109022440, "step": 5059, "time_per_iteration": 2.5003468990325928 }, { "auxiliary_loss_clip": 0.01136476, "auxiliary_loss_mlp": 0.0103122, "balance_loss_clip": 1.0461483, "balance_loss_mlp": 1.02392411, "epoch": 0.6084290266338003, "flos": 16471435178880.0, "grad_norm": 3.2171644880225685, "language_loss": 0.75511885, "learning_rate": 1.4039356407412325e-06, "loss": 0.7767958, "num_input_tokens_seen": 109035680, "step": 5060, "time_per_iteration": 2.4715845584869385 }, { "auxiliary_loss_clip": 0.01057504, "auxiliary_loss_mlp": 0.01006219, "balance_loss_clip": 1.01745939, "balance_loss_mlp": 1.00492573, "epoch": 0.6085492695244393, "flos": 66443574931200.0, "grad_norm": 0.7831537492150985, "language_loss": 0.57146549, "learning_rate": 1.40319211404911e-06, "loss": 0.59210277, "num_input_tokens_seen": 109090680, "step": 5061, "time_per_iteration": 3.066530227661133 }, { "auxiliary_loss_clip": 0.01171089, "auxiliary_loss_mlp": 0.01030953, "balance_loss_clip": 1.05088449, "balance_loss_mlp": 1.02378893, "epoch": 0.6086695124150785, "flos": 23618986709760.0, "grad_norm": 1.6985727293282236, "language_loss": 0.90548825, "learning_rate": 1.4024486778922691e-06, "loss": 0.92750871, "num_input_tokens_seen": 109108995, "step": 5062, "time_per_iteration": 2.544379472732544 }, { "auxiliary_loss_clip": 0.0114634, "auxiliary_loss_mlp": 0.01028583, "balance_loss_clip": 1.04569292, "balance_loss_mlp": 1.02160597, "epoch": 0.6087897553057176, "flos": 20157054917760.0, "grad_norm": 1.9023240487785429, "language_loss": 0.76764518, "learning_rate": 1.4017053323834884e-06, "loss": 0.78939438, "num_input_tokens_seen": 109128825, "step": 5063, "time_per_iteration": 2.530339002609253 }, { "auxiliary_loss_clip": 0.01142137, "auxiliary_loss_mlp": 0.01027493, "balance_loss_clip": 1.04512393, "balance_loss_mlp": 1.02019775, "epoch": 0.6089099981963566, "flos": 25482535194240.0, "grad_norm": 2.0078600141272247, "language_loss": 0.76283896, "learning_rate": 1.4009620776355333e-06, "loss": 0.78453529, "num_input_tokens_seen": 109150425, "step": 5064, "time_per_iteration": 2.5701327323913574 }, { "auxiliary_loss_clip": 0.01153238, "auxiliary_loss_mlp": 0.0103539, "balance_loss_clip": 1.04729366, "balance_loss_mlp": 1.02723002, "epoch": 0.6090302410869958, "flos": 25332895134720.0, "grad_norm": 1.756782841512221, "language_loss": 0.79081744, "learning_rate": 1.4002189137611553e-06, "loss": 0.81270373, "num_input_tokens_seen": 109169765, "step": 5065, "time_per_iteration": 2.532048225402832 }, { "auxiliary_loss_clip": 0.01153278, "auxiliary_loss_mlp": 0.01033108, "balance_loss_clip": 1.04774785, "balance_loss_mlp": 1.02531528, "epoch": 0.6091504839776348, "flos": 23987358639360.0, "grad_norm": 2.8675355436094887, "language_loss": 0.69467521, "learning_rate": 1.3994758408730901e-06, "loss": 0.71653914, "num_input_tokens_seen": 109188950, "step": 5066, "time_per_iteration": 2.525874614715576 }, { "auxiliary_loss_clip": 0.01143515, "auxiliary_loss_mlp": 0.01026742, "balance_loss_clip": 1.05069149, "balance_loss_mlp": 1.01947665, "epoch": 0.6092707268682739, "flos": 29643037666560.0, "grad_norm": 1.8470910208855524, "language_loss": 0.76441479, "learning_rate": 1.3987328590840629e-06, "loss": 0.78611743, "num_input_tokens_seen": 109209895, "step": 5067, "time_per_iteration": 2.63199782371521 }, { "auxiliary_loss_clip": 0.01150765, "auxiliary_loss_mlp": 0.01025956, "balance_loss_clip": 1.04621947, "balance_loss_mlp": 1.0185864, "epoch": 0.609390969758913, "flos": 24024957200640.0, "grad_norm": 1.721551124570284, "language_loss": 0.85964131, "learning_rate": 1.397989968506783e-06, "loss": 0.88140857, "num_input_tokens_seen": 109228905, "step": 5068, "time_per_iteration": 2.5091867446899414 }, { "auxiliary_loss_clip": 0.01175781, "auxiliary_loss_mlp": 0.01030597, "balance_loss_clip": 1.05303454, "balance_loss_mlp": 1.02341473, "epoch": 0.6095112126495521, "flos": 11102143288320.0, "grad_norm": 2.109023382822166, "language_loss": 0.72671509, "learning_rate": 1.3972471692539458e-06, "loss": 0.74877894, "num_input_tokens_seen": 109243620, "step": 5069, "time_per_iteration": 2.4198598861694336 }, { "auxiliary_loss_clip": 0.01139239, "auxiliary_loss_mlp": 0.01030885, "balance_loss_clip": 1.04768944, "balance_loss_mlp": 1.02317774, "epoch": 0.6096314555401912, "flos": 17265491187840.0, "grad_norm": 3.4164571385206086, "language_loss": 0.75351608, "learning_rate": 1.3965044614382348e-06, "loss": 0.77521729, "num_input_tokens_seen": 109259070, "step": 5070, "time_per_iteration": 2.4764938354492188 }, { "auxiliary_loss_clip": 0.01173107, "auxiliary_loss_mlp": 0.01027738, "balance_loss_clip": 1.05251002, "balance_loss_mlp": 1.01969767, "epoch": 0.6097516984308303, "flos": 21645910679040.0, "grad_norm": 2.487989062878312, "language_loss": 0.75452071, "learning_rate": 1.3957618451723162e-06, "loss": 0.77652919, "num_input_tokens_seen": 109275100, "step": 5071, "time_per_iteration": 2.442556619644165 }, { "auxiliary_loss_clip": 0.01142018, "auxiliary_loss_mlp": 0.01032854, "balance_loss_clip": 1.04707325, "balance_loss_mlp": 1.02551961, "epoch": 0.6098719413214694, "flos": 27199208966400.0, "grad_norm": 1.8373054300509908, "language_loss": 0.7177304, "learning_rate": 1.3950193205688457e-06, "loss": 0.73947912, "num_input_tokens_seen": 109294825, "step": 5072, "time_per_iteration": 3.3874645233154297 }, { "auxiliary_loss_clip": 0.01138344, "auxiliary_loss_mlp": 0.0103031, "balance_loss_clip": 1.04713941, "balance_loss_mlp": 1.02354479, "epoch": 0.6099921842121084, "flos": 20412954385920.0, "grad_norm": 1.7461668814778766, "language_loss": 0.83456469, "learning_rate": 1.3942768877404627e-06, "loss": 0.85625124, "num_input_tokens_seen": 109313790, "step": 5073, "time_per_iteration": 2.497920513153076 }, { "auxiliary_loss_clip": 0.0116795, "auxiliary_loss_mlp": 0.01027124, "balance_loss_clip": 1.04798901, "balance_loss_mlp": 1.01939309, "epoch": 0.6101124271027476, "flos": 23366139897600.0, "grad_norm": 1.9610123389150957, "language_loss": 0.73571491, "learning_rate": 1.393534546799795e-06, "loss": 0.75766563, "num_input_tokens_seen": 109333490, "step": 5074, "time_per_iteration": 2.49157452583313 }, { "auxiliary_loss_clip": 0.01134555, "auxiliary_loss_mlp": 0.01027311, "balance_loss_clip": 1.04720521, "balance_loss_mlp": 1.01961327, "epoch": 0.6102326699933867, "flos": 26687840993280.0, "grad_norm": 1.5186531643137702, "language_loss": 0.67445815, "learning_rate": 1.3927922978594536e-06, "loss": 0.69607681, "num_input_tokens_seen": 109354575, "step": 5075, "time_per_iteration": 2.561363935470581 }, { "auxiliary_loss_clip": 0.01053269, "auxiliary_loss_mlp": 0.01003398, "balance_loss_clip": 1.01615739, "balance_loss_mlp": 1.00221777, "epoch": 0.6103529128840257, "flos": 60644612551680.0, "grad_norm": 0.768844702105615, "language_loss": 0.57489496, "learning_rate": 1.3920501410320387e-06, "loss": 0.59546161, "num_input_tokens_seen": 109410690, "step": 5076, "time_per_iteration": 3.041224479675293 }, { "auxiliary_loss_clip": 0.0114142, "auxiliary_loss_mlp": 0.01026004, "balance_loss_clip": 1.045928, "balance_loss_mlp": 1.01832736, "epoch": 0.6104731557746649, "flos": 19021307806080.0, "grad_norm": 2.1129162305171776, "language_loss": 0.76131272, "learning_rate": 1.3913080764301333e-06, "loss": 0.78298688, "num_input_tokens_seen": 109427650, "step": 5077, "time_per_iteration": 2.508819818496704 }, { "auxiliary_loss_clip": 0.01122067, "auxiliary_loss_mlp": 0.01031384, "balance_loss_clip": 1.04278874, "balance_loss_mlp": 1.02432382, "epoch": 0.6105933986653039, "flos": 23366894083200.0, "grad_norm": 1.7692549118955077, "language_loss": 0.71272635, "learning_rate": 1.3905661041663085e-06, "loss": 0.73426092, "num_input_tokens_seen": 109448835, "step": 5078, "time_per_iteration": 3.3891961574554443 }, { "auxiliary_loss_clip": 0.01156254, "auxiliary_loss_mlp": 0.01030824, "balance_loss_clip": 1.04969811, "balance_loss_mlp": 1.02197862, "epoch": 0.610713641555943, "flos": 34637565006720.0, "grad_norm": 1.9947400898472356, "language_loss": 0.65335715, "learning_rate": 1.389824224353122e-06, "loss": 0.67522788, "num_input_tokens_seen": 109470425, "step": 5079, "time_per_iteration": 2.609548330307007 }, { "auxiliary_loss_clip": 0.01154684, "auxiliary_loss_mlp": 0.0102778, "balance_loss_clip": 1.04941726, "balance_loss_mlp": 1.02053213, "epoch": 0.610833884446582, "flos": 26646471504000.0, "grad_norm": 1.4812910735915281, "language_loss": 0.76813293, "learning_rate": 1.389082437103115e-06, "loss": 0.78995752, "num_input_tokens_seen": 109489695, "step": 5080, "time_per_iteration": 2.5400121212005615 }, { "auxiliary_loss_clip": 0.01125895, "auxiliary_loss_mlp": 0.01030136, "balance_loss_clip": 1.04383016, "balance_loss_mlp": 1.02219057, "epoch": 0.6109541273372212, "flos": 21215126868480.0, "grad_norm": 1.7562643721522815, "language_loss": 0.77691829, "learning_rate": 1.3883407425288172e-06, "loss": 0.7984786, "num_input_tokens_seen": 109510030, "step": 5081, "time_per_iteration": 3.431467056274414 }, { "auxiliary_loss_clip": 0.01138565, "auxiliary_loss_mlp": 0.01030433, "balance_loss_clip": 1.04603624, "balance_loss_mlp": 1.02266622, "epoch": 0.6110743702278603, "flos": 20084084438400.0, "grad_norm": 2.4662909038627836, "language_loss": 0.79948795, "learning_rate": 1.3875991407427417e-06, "loss": 0.8211779, "num_input_tokens_seen": 109528255, "step": 5082, "time_per_iteration": 3.2683560848236084 }, { "auxiliary_loss_clip": 0.01035737, "auxiliary_loss_mlp": 0.01000541, "balance_loss_clip": 1.01433074, "balance_loss_mlp": 0.99934906, "epoch": 0.6111946131184993, "flos": 68302957438080.0, "grad_norm": 0.7748458364660881, "language_loss": 0.58270597, "learning_rate": 1.38685763185739e-06, "loss": 0.60306877, "num_input_tokens_seen": 109581915, "step": 5083, "time_per_iteration": 3.173628568649292 }, { "auxiliary_loss_clip": 0.01167222, "auxiliary_loss_mlp": 0.01032243, "balance_loss_clip": 1.04969764, "balance_loss_mlp": 1.02467048, "epoch": 0.6113148560091385, "flos": 19937676602880.0, "grad_norm": 2.3920738268788853, "language_loss": 0.67444587, "learning_rate": 1.3861162159852476e-06, "loss": 0.69644052, "num_input_tokens_seen": 109600050, "step": 5084, "time_per_iteration": 2.4421184062957764 }, { "auxiliary_loss_clip": 0.01145765, "auxiliary_loss_mlp": 0.01028204, "balance_loss_clip": 1.0475322, "balance_loss_mlp": 1.02092934, "epoch": 0.6114350988997775, "flos": 23731854220800.0, "grad_norm": 1.7998744903246418, "language_loss": 0.79485965, "learning_rate": 1.3853748932387875e-06, "loss": 0.81659937, "num_input_tokens_seen": 109620690, "step": 5085, "time_per_iteration": 2.547948122024536 }, { "auxiliary_loss_clip": 0.01131295, "auxiliary_loss_mlp": 0.01027899, "balance_loss_clip": 1.04632354, "balance_loss_mlp": 1.02004313, "epoch": 0.6115553417904166, "flos": 24023700224640.0, "grad_norm": 2.328699132984908, "language_loss": 0.75204015, "learning_rate": 1.3846336637304671e-06, "loss": 0.77363205, "num_input_tokens_seen": 109638960, "step": 5086, "time_per_iteration": 2.5320703983306885 }, { "auxiliary_loss_clip": 0.01130793, "auxiliary_loss_mlp": 0.01024551, "balance_loss_clip": 1.04467821, "balance_loss_mlp": 1.01725221, "epoch": 0.6116755846810558, "flos": 23733542160000.0, "grad_norm": 2.187169061554951, "language_loss": 0.8331092, "learning_rate": 1.3838925275727316e-06, "loss": 0.85466266, "num_input_tokens_seen": 109659700, "step": 5087, "time_per_iteration": 2.540968418121338 }, { "auxiliary_loss_clip": 0.011714, "auxiliary_loss_mlp": 0.01025289, "balance_loss_clip": 1.05166662, "balance_loss_mlp": 1.01795816, "epoch": 0.6117958275716948, "flos": 18661626967680.0, "grad_norm": 1.7073873585641228, "language_loss": 0.78776324, "learning_rate": 1.3831514848780089e-06, "loss": 0.80973011, "num_input_tokens_seen": 109679275, "step": 5088, "time_per_iteration": 2.4484424591064453 }, { "auxiliary_loss_clip": 0.0114915, "auxiliary_loss_mlp": 0.01026421, "balance_loss_clip": 1.04802632, "balance_loss_mlp": 1.01931572, "epoch": 0.6119160704623339, "flos": 16471183783680.0, "grad_norm": 2.2447062906859205, "language_loss": 0.92023253, "learning_rate": 1.3824105357587152e-06, "loss": 0.94198823, "num_input_tokens_seen": 109696380, "step": 5089, "time_per_iteration": 2.446146011352539 }, { "auxiliary_loss_clip": 0.01138244, "auxiliary_loss_mlp": 0.01025289, "balance_loss_clip": 1.04691768, "balance_loss_mlp": 1.01852679, "epoch": 0.612036313352973, "flos": 23915465568000.0, "grad_norm": 1.4901028536063987, "language_loss": 0.82544166, "learning_rate": 1.381669680327253e-06, "loss": 0.84707701, "num_input_tokens_seen": 109718060, "step": 5090, "time_per_iteration": 2.5735507011413574 }, { "auxiliary_loss_clip": 0.01135288, "auxiliary_loss_mlp": 0.01027187, "balance_loss_clip": 1.0489527, "balance_loss_mlp": 1.01972747, "epoch": 0.6121565562436121, "flos": 26974766833920.0, "grad_norm": 1.9017451731346935, "language_loss": 0.70652163, "learning_rate": 1.380928918696008e-06, "loss": 0.72814631, "num_input_tokens_seen": 109736830, "step": 5091, "time_per_iteration": 2.5679984092712402 }, { "auxiliary_loss_clip": 0.0115316, "auxiliary_loss_mlp": 0.01025495, "balance_loss_clip": 1.04807949, "balance_loss_mlp": 1.0186466, "epoch": 0.6122767991342511, "flos": 15668867646720.0, "grad_norm": 2.4413743526894875, "language_loss": 0.71899438, "learning_rate": 1.3801882509773548e-06, "loss": 0.74078089, "num_input_tokens_seen": 109754690, "step": 5092, "time_per_iteration": 2.4641082286834717 }, { "auxiliary_loss_clip": 0.01148473, "auxiliary_loss_mlp": 0.01027659, "balance_loss_clip": 1.0458957, "balance_loss_mlp": 1.01994038, "epoch": 0.6123970420248903, "flos": 27964321591680.0, "grad_norm": 1.868708455105071, "language_loss": 0.81670088, "learning_rate": 1.3794476772836503e-06, "loss": 0.83846223, "num_input_tokens_seen": 109775790, "step": 5093, "time_per_iteration": 2.5375571250915527 }, { "auxiliary_loss_clip": 0.01119633, "auxiliary_loss_mlp": 0.01024019, "balance_loss_clip": 1.0457257, "balance_loss_mlp": 1.01601434, "epoch": 0.6125172849155294, "flos": 21468727866240.0, "grad_norm": 1.5319802667187998, "language_loss": 0.84635758, "learning_rate": 1.3787071977272402e-06, "loss": 0.86779404, "num_input_tokens_seen": 109795050, "step": 5094, "time_per_iteration": 2.5589678287506104 }, { "auxiliary_loss_clip": 0.0111084, "auxiliary_loss_mlp": 0.01028323, "balance_loss_clip": 1.04736161, "balance_loss_mlp": 1.02093482, "epoch": 0.6126375278061684, "flos": 16248321849600.0, "grad_norm": 3.1281301667762227, "language_loss": 0.72140563, "learning_rate": 1.3779668124204535e-06, "loss": 0.74279726, "num_input_tokens_seen": 109811465, "step": 5095, "time_per_iteration": 2.542752504348755 }, { "auxiliary_loss_clip": 0.01136961, "auxiliary_loss_mlp": 0.0102566, "balance_loss_clip": 1.04880309, "balance_loss_mlp": 1.01869869, "epoch": 0.6127577706968076, "flos": 20448865008000.0, "grad_norm": 1.5883424930327767, "language_loss": 0.80849802, "learning_rate": 1.3772265214756074e-06, "loss": 0.83012426, "num_input_tokens_seen": 109831225, "step": 5096, "time_per_iteration": 2.513594150543213 }, { "auxiliary_loss_clip": 0.01159231, "auxiliary_loss_mlp": 0.01029037, "balance_loss_clip": 1.04833245, "balance_loss_mlp": 1.0215683, "epoch": 0.6128780135874466, "flos": 18260397072000.0, "grad_norm": 1.70103814607756, "language_loss": 0.7483443, "learning_rate": 1.3764863250050025e-06, "loss": 0.77022696, "num_input_tokens_seen": 109849465, "step": 5097, "time_per_iteration": 2.4705862998962402 }, { "auxiliary_loss_clip": 0.01128483, "auxiliary_loss_mlp": 0.01022984, "balance_loss_clip": 1.04585886, "balance_loss_mlp": 1.01609039, "epoch": 0.6129982564780857, "flos": 24937088192640.0, "grad_norm": 1.7405721520147095, "language_loss": 0.80436003, "learning_rate": 1.3757462231209272e-06, "loss": 0.82587469, "num_input_tokens_seen": 109869770, "step": 5098, "time_per_iteration": 3.432140588760376 }, { "auxiliary_loss_clip": 0.01138014, "auxiliary_loss_mlp": 0.01038384, "balance_loss_clip": 1.04875433, "balance_loss_mlp": 1.03068316, "epoch": 0.6131184993687249, "flos": 22492038430080.0, "grad_norm": 1.8462632323286408, "language_loss": 0.88827646, "learning_rate": 1.3750062159356525e-06, "loss": 0.91004044, "num_input_tokens_seen": 109889120, "step": 5099, "time_per_iteration": 2.537116050720215 }, { "auxiliary_loss_clip": 0.01119972, "auxiliary_loss_mlp": 0.0102801, "balance_loss_clip": 1.0457232, "balance_loss_mlp": 1.02123916, "epoch": 0.6132387422593639, "flos": 15885839750400.0, "grad_norm": 1.894764994743442, "language_loss": 0.83217216, "learning_rate": 1.3742663035614382e-06, "loss": 0.853652, "num_input_tokens_seen": 109906490, "step": 5100, "time_per_iteration": 2.5497753620147705 }, { "auxiliary_loss_clip": 0.01171924, "auxiliary_loss_mlp": 0.01033005, "balance_loss_clip": 1.05189514, "balance_loss_mlp": 1.02551568, "epoch": 0.613358985150003, "flos": 25411539962880.0, "grad_norm": 1.6052755787774182, "language_loss": 0.79927951, "learning_rate": 1.3735264861105283e-06, "loss": 0.82132882, "num_input_tokens_seen": 109927130, "step": 5101, "time_per_iteration": 2.4985053539276123 }, { "auxiliary_loss_clip": 0.01128078, "auxiliary_loss_mlp": 0.01024911, "balance_loss_clip": 1.04623282, "balance_loss_mlp": 1.01772308, "epoch": 0.6134792280406421, "flos": 21361283308800.0, "grad_norm": 1.9466554206314726, "language_loss": 0.78397942, "learning_rate": 1.372786763695152e-06, "loss": 0.80550921, "num_input_tokens_seen": 109945890, "step": 5102, "time_per_iteration": 2.556319236755371 }, { "auxiliary_loss_clip": 0.01156306, "auxiliary_loss_mlp": 0.01032631, "balance_loss_clip": 1.04867911, "balance_loss_mlp": 1.02462018, "epoch": 0.6135994709312812, "flos": 21211248199680.0, "grad_norm": 2.085458628864434, "language_loss": 0.77113438, "learning_rate": 1.3720471364275257e-06, "loss": 0.79302371, "num_input_tokens_seen": 109965535, "step": 5103, "time_per_iteration": 2.491875648498535 }, { "auxiliary_loss_clip": 0.01121856, "auxiliary_loss_mlp": 0.00761488, "balance_loss_clip": 1.0432353, "balance_loss_mlp": 1.00013697, "epoch": 0.6137197138219203, "flos": 14794047907200.0, "grad_norm": 1.900040135435428, "language_loss": 0.78064907, "learning_rate": 1.3713076044198486e-06, "loss": 0.79948252, "num_input_tokens_seen": 109982345, "step": 5104, "time_per_iteration": 3.3065407276153564 }, { "auxiliary_loss_clip": 0.01137691, "auxiliary_loss_mlp": 0.01026121, "balance_loss_clip": 1.04720807, "balance_loss_mlp": 1.0183723, "epoch": 0.6138399567125594, "flos": 20084515401600.0, "grad_norm": 8.833371000956078, "language_loss": 0.81079447, "learning_rate": 1.3705681677843086e-06, "loss": 0.83243263, "num_input_tokens_seen": 110000940, "step": 5105, "time_per_iteration": 2.513887405395508 }, { "auxiliary_loss_clip": 0.0106434, "auxiliary_loss_mlp": 0.01001597, "balance_loss_clip": 1.01523805, "balance_loss_mlp": 1.00045896, "epoch": 0.6139601996031985, "flos": 60123838193280.0, "grad_norm": 0.7720923111961804, "language_loss": 0.60613787, "learning_rate": 1.3698288266330768e-06, "loss": 0.62679732, "num_input_tokens_seen": 110061565, "step": 5106, "time_per_iteration": 3.1550843715667725 }, { "auxiliary_loss_clip": 0.01139938, "auxiliary_loss_mlp": 0.0102285, "balance_loss_clip": 1.05245185, "balance_loss_mlp": 1.01556969, "epoch": 0.6140804424938375, "flos": 23586703361280.0, "grad_norm": 2.5320270139646457, "language_loss": 0.72414708, "learning_rate": 1.3690895810783113e-06, "loss": 0.74577492, "num_input_tokens_seen": 110080360, "step": 5107, "time_per_iteration": 3.3648622035980225 }, { "auxiliary_loss_clip": 0.01103276, "auxiliary_loss_mlp": 0.00761392, "balance_loss_clip": 1.04260683, "balance_loss_mlp": 1.00017059, "epoch": 0.6142006853844767, "flos": 21398199511680.0, "grad_norm": 2.2052196361125, "language_loss": 0.71117556, "learning_rate": 1.3683504312321543e-06, "loss": 0.72982216, "num_input_tokens_seen": 110100695, "step": 5108, "time_per_iteration": 3.35969614982605 }, { "auxiliary_loss_clip": 0.01158673, "auxiliary_loss_mlp": 0.01027079, "balance_loss_clip": 1.04862237, "balance_loss_mlp": 1.01978612, "epoch": 0.6143209282751158, "flos": 12057367622400.0, "grad_norm": 1.9012011811642793, "language_loss": 0.80142009, "learning_rate": 1.3676113772067355e-06, "loss": 0.82327765, "num_input_tokens_seen": 110117750, "step": 5109, "time_per_iteration": 2.4974019527435303 }, { "auxiliary_loss_clip": 0.01120972, "auxiliary_loss_mlp": 0.01024398, "balance_loss_clip": 1.04603148, "balance_loss_mlp": 1.01721001, "epoch": 0.6144411711657548, "flos": 25082274965760.0, "grad_norm": 2.184381966241592, "language_loss": 0.72748137, "learning_rate": 1.3668724191141671e-06, "loss": 0.7489351, "num_input_tokens_seen": 110137020, "step": 5110, "time_per_iteration": 2.618596315383911 }, { "auxiliary_loss_clip": 0.01123361, "auxiliary_loss_mlp": 0.01030558, "balance_loss_clip": 1.04983997, "balance_loss_mlp": 1.02372468, "epoch": 0.6145614140563939, "flos": 20114069316480.0, "grad_norm": 2.236350660298103, "language_loss": 0.66543901, "learning_rate": 1.3661335570665493e-06, "loss": 0.68697822, "num_input_tokens_seen": 110154930, "step": 5111, "time_per_iteration": 2.5453264713287354 }, { "auxiliary_loss_clip": 0.01144514, "auxiliary_loss_mlp": 0.01029042, "balance_loss_clip": 1.04988217, "balance_loss_mlp": 1.02177, "epoch": 0.614681656947033, "flos": 16800376953600.0, "grad_norm": 2.454641646748015, "language_loss": 0.69908071, "learning_rate": 1.3653947911759676e-06, "loss": 0.72081631, "num_input_tokens_seen": 110172480, "step": 5112, "time_per_iteration": 2.4968929290771484 }, { "auxiliary_loss_clip": 0.01108398, "auxiliary_loss_mlp": 0.01028799, "balance_loss_clip": 1.04445302, "balance_loss_mlp": 1.02101469, "epoch": 0.6148018998376721, "flos": 38801587011840.0, "grad_norm": 1.7818902040001832, "language_loss": 0.743451, "learning_rate": 1.3646561215544904e-06, "loss": 0.76482296, "num_input_tokens_seen": 110197120, "step": 5113, "time_per_iteration": 2.744920015335083 }, { "auxiliary_loss_clip": 0.01155308, "auxiliary_loss_mlp": 0.01027082, "balance_loss_clip": 1.05056739, "balance_loss_mlp": 1.01963186, "epoch": 0.6149221427283111, "flos": 23327032965120.0, "grad_norm": 1.973460872023696, "language_loss": 0.79144919, "learning_rate": 1.363917548314176e-06, "loss": 0.81327307, "num_input_tokens_seen": 110216385, "step": 5114, "time_per_iteration": 2.4940006732940674 }, { "auxiliary_loss_clip": 0.01160742, "auxiliary_loss_mlp": 0.01031521, "balance_loss_clip": 1.04994845, "balance_loss_mlp": 1.02404928, "epoch": 0.6150423856189503, "flos": 22379494141440.0, "grad_norm": 1.7971877698039849, "language_loss": 0.73122835, "learning_rate": 1.3631790715670626e-06, "loss": 0.75315094, "num_input_tokens_seen": 110234790, "step": 5115, "time_per_iteration": 2.499669075012207 }, { "auxiliary_loss_clip": 0.01079876, "auxiliary_loss_mlp": 0.01028371, "balance_loss_clip": 1.04505038, "balance_loss_mlp": 1.02199054, "epoch": 0.6151626285095894, "flos": 18692078722560.0, "grad_norm": 1.9748437391176514, "language_loss": 0.85479105, "learning_rate": 1.3624406914251783e-06, "loss": 0.87587357, "num_input_tokens_seen": 110251910, "step": 5116, "time_per_iteration": 2.7628390789031982 }, { "auxiliary_loss_clip": 0.01155609, "auxiliary_loss_mlp": 0.01029689, "balance_loss_clip": 1.04831481, "balance_loss_mlp": 1.02309346, "epoch": 0.6152828714002284, "flos": 15851688894720.0, "grad_norm": 1.9363700028114572, "language_loss": 0.87913376, "learning_rate": 1.3617024080005335e-06, "loss": 0.90098679, "num_input_tokens_seen": 110268810, "step": 5117, "time_per_iteration": 2.879237174987793 }, { "auxiliary_loss_clip": 0.01141634, "auxiliary_loss_mlp": 0.00761521, "balance_loss_clip": 1.04713154, "balance_loss_mlp": 1.00014758, "epoch": 0.6154031142908676, "flos": 24869792062080.0, "grad_norm": 2.3746447507404786, "language_loss": 0.74315381, "learning_rate": 1.3609642214051266e-06, "loss": 0.76218534, "num_input_tokens_seen": 110293035, "step": 5118, "time_per_iteration": 2.669893264770508 }, { "auxiliary_loss_clip": 0.0113348, "auxiliary_loss_mlp": 0.01028135, "balance_loss_clip": 1.04695034, "balance_loss_mlp": 1.02086043, "epoch": 0.6155233571815066, "flos": 19244744357760.0, "grad_norm": 1.7206600486248265, "language_loss": 0.66299653, "learning_rate": 1.3602261317509385e-06, "loss": 0.68461269, "num_input_tokens_seen": 110309695, "step": 5119, "time_per_iteration": 2.5378570556640625 }, { "auxiliary_loss_clip": 0.01156219, "auxiliary_loss_mlp": 0.01026658, "balance_loss_clip": 1.04955721, "balance_loss_mlp": 1.01865315, "epoch": 0.6156436000721457, "flos": 18770077105920.0, "grad_norm": 2.2830852463141955, "language_loss": 0.82961619, "learning_rate": 1.3594881391499387e-06, "loss": 0.85144508, "num_input_tokens_seen": 110328610, "step": 5120, "time_per_iteration": 2.4864237308502197 }, { "auxiliary_loss_clip": 0.01141937, "auxiliary_loss_mlp": 0.01032121, "balance_loss_clip": 1.04813349, "balance_loss_mlp": 1.0250814, "epoch": 0.6157638429627849, "flos": 18041198325120.0, "grad_norm": 1.6921306289473252, "language_loss": 0.79324675, "learning_rate": 1.3587502437140778e-06, "loss": 0.81498736, "num_input_tokens_seen": 110346775, "step": 5121, "time_per_iteration": 2.5101499557495117 }, { "auxiliary_loss_clip": 0.01143971, "auxiliary_loss_mlp": 0.01026471, "balance_loss_clip": 1.0463047, "balance_loss_mlp": 1.01895475, "epoch": 0.6158840858534239, "flos": 25556726736000.0, "grad_norm": 1.9647656160343652, "language_loss": 0.84895867, "learning_rate": 1.3580124455552952e-06, "loss": 0.87066311, "num_input_tokens_seen": 110366140, "step": 5122, "time_per_iteration": 2.56144118309021 }, { "auxiliary_loss_clip": 0.01156729, "auxiliary_loss_mlp": 0.00761023, "balance_loss_clip": 1.05109012, "balance_loss_mlp": 1.00017369, "epoch": 0.616004328744063, "flos": 24640788902400.0, "grad_norm": 1.8589675293419443, "language_loss": 0.87505305, "learning_rate": 1.3572747447855148e-06, "loss": 0.8942306, "num_input_tokens_seen": 110386550, "step": 5123, "time_per_iteration": 2.5181820392608643 }, { "auxiliary_loss_clip": 0.01171773, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.05178034, "balance_loss_mlp": 1.02617419, "epoch": 0.6161245716347021, "flos": 21689686379520.0, "grad_norm": 1.8578992538636085, "language_loss": 0.6940375, "learning_rate": 1.356537141516644e-06, "loss": 0.71609545, "num_input_tokens_seen": 110403970, "step": 5124, "time_per_iteration": 3.2836720943450928 }, { "auxiliary_loss_clip": 0.01156229, "auxiliary_loss_mlp": 0.01029344, "balance_loss_clip": 1.05226302, "balance_loss_mlp": 1.02216816, "epoch": 0.6162448145253412, "flos": 35189225061120.0, "grad_norm": 1.7464976920291766, "language_loss": 0.61609071, "learning_rate": 1.3557996358605775e-06, "loss": 0.63794643, "num_input_tokens_seen": 110423890, "step": 5125, "time_per_iteration": 2.604205369949341 }, { "auxiliary_loss_clip": 0.01156832, "auxiliary_loss_mlp": 0.01027107, "balance_loss_clip": 1.05020261, "balance_loss_mlp": 1.02052927, "epoch": 0.6163650574159802, "flos": 21615279356160.0, "grad_norm": 2.024759929836643, "language_loss": 0.70077312, "learning_rate": 1.3550622279291941e-06, "loss": 0.7226125, "num_input_tokens_seen": 110442035, "step": 5126, "time_per_iteration": 2.4953577518463135 }, { "auxiliary_loss_clip": 0.01102677, "auxiliary_loss_mlp": 0.01031381, "balance_loss_clip": 1.0411247, "balance_loss_mlp": 1.02416015, "epoch": 0.6164853003066194, "flos": 24572163968640.0, "grad_norm": 1.349276355731954, "language_loss": 0.8323983, "learning_rate": 1.354324917834358e-06, "loss": 0.8537389, "num_input_tokens_seen": 110463280, "step": 5127, "time_per_iteration": 2.6244421005249023 }, { "auxiliary_loss_clip": 0.01095351, "auxiliary_loss_mlp": 0.00760731, "balance_loss_clip": 1.04074001, "balance_loss_mlp": 1.00014448, "epoch": 0.6166055431972585, "flos": 21835986474240.0, "grad_norm": 1.6489113312843469, "language_loss": 0.76688659, "learning_rate": 1.353587705687918e-06, "loss": 0.78544736, "num_input_tokens_seen": 110481455, "step": 5128, "time_per_iteration": 2.639486074447632 }, { "auxiliary_loss_clip": 0.01149973, "auxiliary_loss_mlp": 0.01028236, "balance_loss_clip": 1.05194712, "balance_loss_mlp": 1.02047491, "epoch": 0.6167257860878975, "flos": 17785262943360.0, "grad_norm": 2.7266679249330115, "language_loss": 0.72912669, "learning_rate": 1.3528505916017096e-06, "loss": 0.75090879, "num_input_tokens_seen": 110499155, "step": 5129, "time_per_iteration": 2.4892218112945557 }, { "auxiliary_loss_clip": 0.0115537, "auxiliary_loss_mlp": 0.01030674, "balance_loss_clip": 1.04825783, "balance_loss_mlp": 1.02340794, "epoch": 0.6168460289785367, "flos": 23214811898880.0, "grad_norm": 2.2556958409390844, "language_loss": 0.88579953, "learning_rate": 1.3521135756875514e-06, "loss": 0.90765995, "num_input_tokens_seen": 110515470, "step": 5130, "time_per_iteration": 3.2697927951812744 }, { "auxiliary_loss_clip": 0.01091419, "auxiliary_loss_mlp": 0.01029873, "balance_loss_clip": 1.04314935, "balance_loss_mlp": 1.02347171, "epoch": 0.6169662718691757, "flos": 26213281482240.0, "grad_norm": 1.473702870846425, "language_loss": 0.8590433, "learning_rate": 1.3513766580572496e-06, "loss": 0.88025618, "num_input_tokens_seen": 110538290, "step": 5131, "time_per_iteration": 2.6853346824645996 }, { "auxiliary_loss_clip": 0.01155136, "auxiliary_loss_mlp": 0.01032689, "balance_loss_clip": 1.04974079, "balance_loss_mlp": 1.02535152, "epoch": 0.6170865147598148, "flos": 19026120228480.0, "grad_norm": 2.2318019701687914, "language_loss": 0.77394843, "learning_rate": 1.3506398388225924e-06, "loss": 0.79582667, "num_input_tokens_seen": 110555610, "step": 5132, "time_per_iteration": 2.4916577339172363 }, { "auxiliary_loss_clip": 0.0116945, "auxiliary_loss_mlp": 0.01029731, "balance_loss_clip": 1.05154431, "balance_loss_mlp": 1.02271843, "epoch": 0.617206757650454, "flos": 18260361158400.0, "grad_norm": 1.6473480401905296, "language_loss": 0.71852016, "learning_rate": 1.349903118095355e-06, "loss": 0.74051201, "num_input_tokens_seen": 110574745, "step": 5133, "time_per_iteration": 3.6419479846954346 }, { "auxiliary_loss_clip": 0.01160027, "auxiliary_loss_mlp": 0.01025821, "balance_loss_clip": 1.05050302, "balance_loss_mlp": 1.01897502, "epoch": 0.617327000541093, "flos": 18186959715840.0, "grad_norm": 1.704792285248535, "language_loss": 0.73468184, "learning_rate": 1.349166495987298e-06, "loss": 0.75654042, "num_input_tokens_seen": 110593310, "step": 5134, "time_per_iteration": 3.1654062271118164 }, { "auxiliary_loss_clip": 0.01039519, "auxiliary_loss_mlp": 0.01004577, "balance_loss_clip": 1.01569462, "balance_loss_mlp": 1.00334346, "epoch": 0.6174472434317321, "flos": 61833796122240.0, "grad_norm": 0.8190713262003482, "language_loss": 0.60926819, "learning_rate": 1.348429972610166e-06, "loss": 0.62970912, "num_input_tokens_seen": 110657615, "step": 5135, "time_per_iteration": 3.1971275806427 }, { "auxiliary_loss_clip": 0.01017331, "auxiliary_loss_mlp": 0.01005008, "balance_loss_clip": 1.01587629, "balance_loss_mlp": 1.00360155, "epoch": 0.6175674863223712, "flos": 71230970494080.0, "grad_norm": 1.1309420185634191, "language_loss": 0.57801795, "learning_rate": 1.3476935480756897e-06, "loss": 0.59824133, "num_input_tokens_seen": 110714365, "step": 5136, "time_per_iteration": 3.046581268310547 }, { "auxiliary_loss_clip": 0.01119105, "auxiliary_loss_mlp": 0.0102817, "balance_loss_clip": 1.04420412, "balance_loss_mlp": 1.02076983, "epoch": 0.6176877292130103, "flos": 21835447770240.0, "grad_norm": 2.159644096005312, "language_loss": 0.75553846, "learning_rate": 1.346957222495583e-06, "loss": 0.77701128, "num_input_tokens_seen": 110732160, "step": 5137, "time_per_iteration": 2.6107821464538574 }, { "auxiliary_loss_clip": 0.01146568, "auxiliary_loss_mlp": 0.00761105, "balance_loss_clip": 1.05201781, "balance_loss_mlp": 1.00016606, "epoch": 0.6178079721036493, "flos": 17741738638080.0, "grad_norm": 2.487076116597684, "language_loss": 0.71079648, "learning_rate": 1.3462209959815466e-06, "loss": 0.72987324, "num_input_tokens_seen": 110746900, "step": 5138, "time_per_iteration": 2.4936232566833496 }, { "auxiliary_loss_clip": 0.0114354, "auxiliary_loss_mlp": 0.01025093, "balance_loss_clip": 1.04907668, "balance_loss_mlp": 1.01835763, "epoch": 0.6179282149942885, "flos": 22633131052800.0, "grad_norm": 2.0658169266479582, "language_loss": 0.7439509, "learning_rate": 1.345484868645265e-06, "loss": 0.76563722, "num_input_tokens_seen": 110765710, "step": 5139, "time_per_iteration": 2.5597472190856934 }, { "auxiliary_loss_clip": 0.01132761, "auxiliary_loss_mlp": 0.01027726, "balance_loss_clip": 1.0477761, "balance_loss_mlp": 1.02085352, "epoch": 0.6180484578849276, "flos": 22310330503680.0, "grad_norm": 2.7320806023250777, "language_loss": 0.78470039, "learning_rate": 1.3447488405984088e-06, "loss": 0.80630523, "num_input_tokens_seen": 110783970, "step": 5140, "time_per_iteration": 2.601616621017456 }, { "auxiliary_loss_clip": 0.0114236, "auxiliary_loss_mlp": 0.01030361, "balance_loss_clip": 1.05124903, "balance_loss_mlp": 1.02309775, "epoch": 0.6181687007755666, "flos": 35225458905600.0, "grad_norm": 2.1072495627136583, "language_loss": 0.69734251, "learning_rate": 1.3440129119526322e-06, "loss": 0.71906966, "num_input_tokens_seen": 110806395, "step": 5141, "time_per_iteration": 2.6859753131866455 }, { "auxiliary_loss_clip": 0.010656, "auxiliary_loss_mlp": 0.01002826, "balance_loss_clip": 1.01699114, "balance_loss_mlp": 1.00170505, "epoch": 0.6182889436662057, "flos": 61547370094080.0, "grad_norm": 0.7979084947118404, "language_loss": 0.51171196, "learning_rate": 1.3432770828195762e-06, "loss": 0.5323962, "num_input_tokens_seen": 110867380, "step": 5142, "time_per_iteration": 3.256830930709839 }, { "auxiliary_loss_clip": 0.01117193, "auxiliary_loss_mlp": 0.01033751, "balance_loss_clip": 1.04363632, "balance_loss_mlp": 1.02646184, "epoch": 0.6184091865568448, "flos": 19609991804160.0, "grad_norm": 2.224131915559719, "language_loss": 0.70293677, "learning_rate": 1.3425413533108635e-06, "loss": 0.72444624, "num_input_tokens_seen": 110885980, "step": 5143, "time_per_iteration": 2.5785207748413086 }, { "auxiliary_loss_clip": 0.01117583, "auxiliary_loss_mlp": 0.01033547, "balance_loss_clip": 1.04955065, "balance_loss_mlp": 1.02634668, "epoch": 0.6185294294474839, "flos": 23586882929280.0, "grad_norm": 3.0625458382975044, "language_loss": 0.70477664, "learning_rate": 1.341805723538105e-06, "loss": 0.72628796, "num_input_tokens_seen": 110906085, "step": 5144, "time_per_iteration": 2.6237895488739014 }, { "auxiliary_loss_clip": 0.01146921, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.04944217, "balance_loss_mlp": 1.02434146, "epoch": 0.618649672338123, "flos": 26762032535040.0, "grad_norm": 1.9601492941775835, "language_loss": 0.77373695, "learning_rate": 1.3410701936128948e-06, "loss": 0.79552227, "num_input_tokens_seen": 110928865, "step": 5145, "time_per_iteration": 2.703033924102783 }, { "auxiliary_loss_clip": 0.01156372, "auxiliary_loss_mlp": 0.01032008, "balance_loss_clip": 1.05088019, "balance_loss_mlp": 1.02471232, "epoch": 0.6187699152287621, "flos": 14456630522880.0, "grad_norm": 2.480518609282498, "language_loss": 0.84703076, "learning_rate": 1.340334763646812e-06, "loss": 0.86891448, "num_input_tokens_seen": 110943000, "step": 5146, "time_per_iteration": 2.509705066680908 }, { "auxiliary_loss_clip": 0.01173382, "auxiliary_loss_mlp": 0.0103409, "balance_loss_clip": 1.05239439, "balance_loss_mlp": 1.0265913, "epoch": 0.6188901581194012, "flos": 20084766796800.0, "grad_norm": 1.8103273862304827, "language_loss": 0.74371338, "learning_rate": 1.3395994337514218e-06, "loss": 0.76578808, "num_input_tokens_seen": 110963170, "step": 5147, "time_per_iteration": 2.517326831817627 }, { "auxiliary_loss_clip": 0.01146696, "auxiliary_loss_mlp": 0.01029153, "balance_loss_clip": 1.04609561, "balance_loss_mlp": 1.02197051, "epoch": 0.6190104010100402, "flos": 25700728360320.0, "grad_norm": 1.5218938877917312, "language_loss": 0.78868079, "learning_rate": 1.3388642040382725e-06, "loss": 0.81043923, "num_input_tokens_seen": 110983595, "step": 5148, "time_per_iteration": 2.561779499053955 }, { "auxiliary_loss_clip": 0.01130573, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 1.04388106, "balance_loss_mlp": 1.02221751, "epoch": 0.6191306439006794, "flos": 30442372974720.0, "grad_norm": 1.640214104135997, "language_loss": 0.84181845, "learning_rate": 1.3381290746188975e-06, "loss": 0.86342019, "num_input_tokens_seen": 111002965, "step": 5149, "time_per_iteration": 3.484020709991455 }, { "auxiliary_loss_clip": 0.01160113, "auxiliary_loss_mlp": 0.01028642, "balance_loss_clip": 1.05448604, "balance_loss_mlp": 1.02158153, "epoch": 0.6192508867913185, "flos": 26685793918080.0, "grad_norm": 1.7873049273108141, "language_loss": 0.67142868, "learning_rate": 1.3373940456048152e-06, "loss": 0.69331622, "num_input_tokens_seen": 111022990, "step": 5150, "time_per_iteration": 2.567626714706421 }, { "auxiliary_loss_clip": 0.01168376, "auxiliary_loss_mlp": 0.01028632, "balance_loss_clip": 1.05033398, "balance_loss_mlp": 1.02149451, "epoch": 0.6193711296819575, "flos": 36722036090880.0, "grad_norm": 1.6964579370371657, "language_loss": 0.59114981, "learning_rate": 1.3366591171075299e-06, "loss": 0.6131199, "num_input_tokens_seen": 111046495, "step": 5151, "time_per_iteration": 2.6399786472320557 }, { "auxiliary_loss_clip": 0.01140954, "auxiliary_loss_mlp": 0.01028153, "balance_loss_clip": 1.0498271, "balance_loss_mlp": 1.02180552, "epoch": 0.6194913725725967, "flos": 25192556697600.0, "grad_norm": 1.9983548170306384, "language_loss": 0.91059065, "learning_rate": 1.335924289238529e-06, "loss": 0.93228179, "num_input_tokens_seen": 111065705, "step": 5152, "time_per_iteration": 2.5936784744262695 }, { "auxiliary_loss_clip": 0.01156047, "auxiliary_loss_mlp": 0.00760572, "balance_loss_clip": 1.05357289, "balance_loss_mlp": 1.00015712, "epoch": 0.6196116154632357, "flos": 21178821196800.0, "grad_norm": 2.3335301913165183, "language_loss": 0.76665592, "learning_rate": 1.3351895621092859e-06, "loss": 0.78582215, "num_input_tokens_seen": 111086050, "step": 5153, "time_per_iteration": 2.544772148132324 }, { "auxiliary_loss_clip": 0.01057259, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.03254128, "balance_loss_mlp": 1.01883411, "epoch": 0.6197318583538748, "flos": 16253744803200.0, "grad_norm": 2.2905575702297014, "language_loss": 0.76629651, "learning_rate": 1.3344549358312567e-06, "loss": 0.78712487, "num_input_tokens_seen": 111104450, "step": 5154, "time_per_iteration": 2.8105413913726807 }, { "auxiliary_loss_clip": 0.01161015, "auxiliary_loss_mlp": 0.01026976, "balance_loss_clip": 1.05375409, "balance_loss_mlp": 1.01987123, "epoch": 0.619852101244514, "flos": 24425612478720.0, "grad_norm": 1.8226751688676548, "language_loss": 0.78392243, "learning_rate": 1.3337204105158852e-06, "loss": 0.80580235, "num_input_tokens_seen": 111123320, "step": 5155, "time_per_iteration": 2.7605700492858887 }, { "auxiliary_loss_clip": 0.01112349, "auxiliary_loss_mlp": 0.01031491, "balance_loss_clip": 1.03800929, "balance_loss_mlp": 1.02471375, "epoch": 0.619972344135153, "flos": 16727298733440.0, "grad_norm": 1.9816947116489763, "language_loss": 0.73077148, "learning_rate": 1.332985986274597e-06, "loss": 0.7522099, "num_input_tokens_seen": 111140950, "step": 5156, "time_per_iteration": 3.3562145233154297 }, { "auxiliary_loss_clip": 0.01093804, "auxiliary_loss_mlp": 0.00760622, "balance_loss_clip": 1.04363656, "balance_loss_mlp": 1.00017738, "epoch": 0.6200925870257921, "flos": 12495190498560.0, "grad_norm": 2.052623051624734, "language_loss": 0.75271773, "learning_rate": 1.3322516632188047e-06, "loss": 0.77126199, "num_input_tokens_seen": 111157845, "step": 5157, "time_per_iteration": 2.6294710636138916 }, { "auxiliary_loss_clip": 0.01125089, "auxiliary_loss_mlp": 0.01030069, "balance_loss_clip": 1.04587913, "balance_loss_mlp": 1.02250206, "epoch": 0.6202128299164312, "flos": 26539350168960.0, "grad_norm": 1.8071717079403955, "language_loss": 0.67194307, "learning_rate": 1.3315174414599045e-06, "loss": 0.69349468, "num_input_tokens_seen": 111179165, "step": 5158, "time_per_iteration": 2.630873441696167 }, { "auxiliary_loss_clip": 0.01149249, "auxiliary_loss_mlp": 0.01028262, "balance_loss_clip": 1.04702973, "balance_loss_mlp": 1.02090061, "epoch": 0.6203330728070703, "flos": 18770508069120.0, "grad_norm": 1.720232384646922, "language_loss": 0.75248718, "learning_rate": 1.3307833211092768e-06, "loss": 0.77426225, "num_input_tokens_seen": 111197830, "step": 5159, "time_per_iteration": 3.3772566318511963 }, { "auxiliary_loss_clip": 0.01169966, "auxiliary_loss_mlp": 0.0103653, "balance_loss_clip": 1.05132747, "balance_loss_mlp": 1.02909112, "epoch": 0.6204533156977093, "flos": 20629782835200.0, "grad_norm": 1.5449880578574986, "language_loss": 0.75166082, "learning_rate": 1.3300493022782873e-06, "loss": 0.77372575, "num_input_tokens_seen": 111218400, "step": 5160, "time_per_iteration": 3.2620089054107666 }, { "auxiliary_loss_clip": 0.01101538, "auxiliary_loss_mlp": 0.00760727, "balance_loss_clip": 1.04208565, "balance_loss_mlp": 1.00012589, "epoch": 0.6205735585883485, "flos": 17348050598400.0, "grad_norm": 1.7828310269634837, "language_loss": 0.72114384, "learning_rate": 1.3293153850782855e-06, "loss": 0.73976648, "num_input_tokens_seen": 111236720, "step": 5161, "time_per_iteration": 2.5783932209014893 }, { "auxiliary_loss_clip": 0.01116894, "auxiliary_loss_mlp": 0.01033176, "balance_loss_clip": 1.04381883, "balance_loss_mlp": 1.02476037, "epoch": 0.6206938014789876, "flos": 22965017742720.0, "grad_norm": 1.7214100184002883, "language_loss": 0.71327335, "learning_rate": 1.3285815696206069e-06, "loss": 0.73477405, "num_input_tokens_seen": 111258265, "step": 5162, "time_per_iteration": 2.5998733043670654 }, { "auxiliary_loss_clip": 0.01125252, "auxiliary_loss_mlp": 0.010341, "balance_loss_clip": 1.04244089, "balance_loss_mlp": 1.02634585, "epoch": 0.6208140443696266, "flos": 23983192661760.0, "grad_norm": 1.7170823976815268, "language_loss": 0.76853412, "learning_rate": 1.32784785601657e-06, "loss": 0.79012764, "num_input_tokens_seen": 111277675, "step": 5163, "time_per_iteration": 2.607658624649048 }, { "auxiliary_loss_clip": 0.0114339, "auxiliary_loss_mlp": 0.01025723, "balance_loss_clip": 1.04639435, "balance_loss_mlp": 1.01841879, "epoch": 0.6209342872602658, "flos": 35077291303680.0, "grad_norm": 1.7215905297211473, "language_loss": 0.74007201, "learning_rate": 1.3271142443774798e-06, "loss": 0.76176322, "num_input_tokens_seen": 111299910, "step": 5164, "time_per_iteration": 2.665806293487549 }, { "auxiliary_loss_clip": 0.01138201, "auxiliary_loss_mlp": 0.01028395, "balance_loss_clip": 1.04683566, "balance_loss_mlp": 1.02146018, "epoch": 0.6210545301509048, "flos": 26979327861120.0, "grad_norm": 1.6689304339663622, "language_loss": 0.8140825, "learning_rate": 1.3263807348146228e-06, "loss": 0.83574843, "num_input_tokens_seen": 111319765, "step": 5165, "time_per_iteration": 2.6081652641296387 }, { "auxiliary_loss_clip": 0.01137896, "auxiliary_loss_mlp": 0.01029873, "balance_loss_clip": 1.04421425, "balance_loss_mlp": 1.02155793, "epoch": 0.6211747730415439, "flos": 33618240852480.0, "grad_norm": 1.8873975851623666, "language_loss": 0.73288655, "learning_rate": 1.3256473274392733e-06, "loss": 0.75456429, "num_input_tokens_seen": 111341110, "step": 5166, "time_per_iteration": 2.6589717864990234 }, { "auxiliary_loss_clip": 0.01168509, "auxiliary_loss_mlp": 0.01034472, "balance_loss_clip": 1.05080104, "balance_loss_mlp": 1.02684855, "epoch": 0.6212950159321831, "flos": 34167099646080.0, "grad_norm": 2.283986737012843, "language_loss": 0.70028925, "learning_rate": 1.3249140223626873e-06, "loss": 0.72231913, "num_input_tokens_seen": 111362730, "step": 5167, "time_per_iteration": 2.5960729122161865 }, { "auxiliary_loss_clip": 0.01153583, "auxiliary_loss_mlp": 0.01026763, "balance_loss_clip": 1.0499289, "balance_loss_mlp": 1.01989996, "epoch": 0.6214152588228221, "flos": 27965758135680.0, "grad_norm": 1.9727413954173814, "language_loss": 0.75221986, "learning_rate": 1.3241808196961077e-06, "loss": 0.77402329, "num_input_tokens_seen": 111383855, "step": 5168, "time_per_iteration": 2.570199489593506 }, { "auxiliary_loss_clip": 0.0112971, "auxiliary_loss_mlp": 0.0102787, "balance_loss_clip": 1.04542994, "balance_loss_mlp": 1.02105117, "epoch": 0.6215355017134612, "flos": 20230204965120.0, "grad_norm": 1.6366827693567902, "language_loss": 0.7063421, "learning_rate": 1.3234477195507608e-06, "loss": 0.72791779, "num_input_tokens_seen": 111402685, "step": 5169, "time_per_iteration": 2.5499320030212402 }, { "auxiliary_loss_clip": 0.01127864, "auxiliary_loss_mlp": 0.01029059, "balance_loss_clip": 1.04820776, "balance_loss_mlp": 1.0218972, "epoch": 0.6216557446041003, "flos": 41428129219200.0, "grad_norm": 2.716323751079925, "language_loss": 0.62144917, "learning_rate": 1.322714722037857e-06, "loss": 0.64301836, "num_input_tokens_seen": 111424130, "step": 5170, "time_per_iteration": 2.7369654178619385 }, { "auxiliary_loss_clip": 0.01134497, "auxiliary_loss_mlp": 0.01029969, "balance_loss_clip": 1.04579306, "balance_loss_mlp": 1.02208281, "epoch": 0.6217759874947394, "flos": 27928770105600.0, "grad_norm": 1.951617636745396, "language_loss": 0.77565312, "learning_rate": 1.321981827268591e-06, "loss": 0.79729784, "num_input_tokens_seen": 111444785, "step": 5171, "time_per_iteration": 2.6371102333068848 }, { "auxiliary_loss_clip": 0.01143694, "auxiliary_loss_mlp": 0.01033428, "balance_loss_clip": 1.04692936, "balance_loss_mlp": 1.02657628, "epoch": 0.6218962303853784, "flos": 21765673601280.0, "grad_norm": 1.8334906426195423, "language_loss": 0.81656498, "learning_rate": 1.3212490353541426e-06, "loss": 0.83833623, "num_input_tokens_seen": 111467045, "step": 5172, "time_per_iteration": 2.5682530403137207 }, { "auxiliary_loss_clip": 0.01168685, "auxiliary_loss_mlp": 0.01026438, "balance_loss_clip": 1.04919744, "balance_loss_mlp": 1.01868677, "epoch": 0.6220164732760175, "flos": 21246260981760.0, "grad_norm": 1.8543169350127724, "language_loss": 0.80459416, "learning_rate": 1.3205163464056762e-06, "loss": 0.82654536, "num_input_tokens_seen": 111483650, "step": 5173, "time_per_iteration": 2.470243453979492 }, { "auxiliary_loss_clip": 0.01151909, "auxiliary_loss_mlp": 0.01029583, "balance_loss_clip": 1.04816699, "balance_loss_mlp": 1.0228653, "epoch": 0.6221367161666567, "flos": 26136360506880.0, "grad_norm": 1.7124061624158653, "language_loss": 0.72822261, "learning_rate": 1.319783760534339e-06, "loss": 0.75003749, "num_input_tokens_seen": 111502895, "step": 5174, "time_per_iteration": 2.537566900253296 }, { "auxiliary_loss_clip": 0.01156203, "auxiliary_loss_mlp": 0.01031271, "balance_loss_clip": 1.05073893, "balance_loss_mlp": 1.02398741, "epoch": 0.6222569590572957, "flos": 16284196558080.0, "grad_norm": 2.0431924889410684, "language_loss": 0.75563133, "learning_rate": 1.319051277851266e-06, "loss": 0.77750605, "num_input_tokens_seen": 111519180, "step": 5175, "time_per_iteration": 3.6371822357177734 }, { "auxiliary_loss_clip": 0.01157383, "auxiliary_loss_mlp": 0.01023611, "balance_loss_clip": 1.05157399, "balance_loss_mlp": 1.01700425, "epoch": 0.6223772019479348, "flos": 18223840005120.0, "grad_norm": 2.0157851450972806, "language_loss": 0.83789229, "learning_rate": 1.3183188984675716e-06, "loss": 0.85970223, "num_input_tokens_seen": 111537545, "step": 5176, "time_per_iteration": 2.494339942932129 }, { "auxiliary_loss_clip": 0.01140615, "auxiliary_loss_mlp": 0.01022117, "balance_loss_clip": 1.04869926, "balance_loss_mlp": 1.01505971, "epoch": 0.6224974448385739, "flos": 27489797994240.0, "grad_norm": 2.3001817730458742, "language_loss": 0.71657372, "learning_rate": 1.3175866224943586e-06, "loss": 0.73820108, "num_input_tokens_seen": 111556265, "step": 5177, "time_per_iteration": 2.5685248374938965 }, { "auxiliary_loss_clip": 0.01146636, "auxiliary_loss_mlp": 0.0103147, "balance_loss_clip": 1.04965425, "balance_loss_mlp": 1.02412367, "epoch": 0.622617687729213, "flos": 19791951125760.0, "grad_norm": 2.040748052949397, "language_loss": 0.73570776, "learning_rate": 1.316854450042712e-06, "loss": 0.75748885, "num_input_tokens_seen": 111574205, "step": 5178, "time_per_iteration": 2.5504019260406494 }, { "auxiliary_loss_clip": 0.01160558, "auxiliary_loss_mlp": 0.01028748, "balance_loss_clip": 1.0512526, "balance_loss_mlp": 1.02120233, "epoch": 0.622737930619852, "flos": 23038886062080.0, "grad_norm": 1.832917914034419, "language_loss": 0.74279761, "learning_rate": 1.3161223812237024e-06, "loss": 0.76469064, "num_input_tokens_seen": 111593560, "step": 5179, "time_per_iteration": 2.512796640396118 }, { "auxiliary_loss_clip": 0.01167862, "auxiliary_loss_mlp": 0.01026672, "balance_loss_clip": 1.0491606, "balance_loss_mlp": 1.01954043, "epoch": 0.6228581735104912, "flos": 12634271959680.0, "grad_norm": 2.2559578846950523, "language_loss": 0.85758203, "learning_rate": 1.3153904161483842e-06, "loss": 0.87952733, "num_input_tokens_seen": 111608860, "step": 5180, "time_per_iteration": 2.4368677139282227 }, { "auxiliary_loss_clip": 0.01124765, "auxiliary_loss_mlp": 0.01026854, "balance_loss_clip": 1.04492068, "balance_loss_mlp": 1.01929021, "epoch": 0.6229784164011303, "flos": 23802813538560.0, "grad_norm": 2.0160247309044403, "language_loss": 0.85559416, "learning_rate": 1.3146585549277953e-06, "loss": 0.8771103, "num_input_tokens_seen": 111627500, "step": 5181, "time_per_iteration": 2.6080188751220703 }, { "auxiliary_loss_clip": 0.01147027, "auxiliary_loss_mlp": 0.0102924, "balance_loss_clip": 1.04961109, "balance_loss_mlp": 1.02239776, "epoch": 0.6230986592917693, "flos": 22414219614720.0, "grad_norm": 3.0077358829789462, "language_loss": 0.78319687, "learning_rate": 1.3139267976729591e-06, "loss": 0.80495954, "num_input_tokens_seen": 111647690, "step": 5182, "time_per_iteration": 3.321894645690918 }, { "auxiliary_loss_clip": 0.01159637, "auxiliary_loss_mlp": 0.01028486, "balance_loss_clip": 1.05029225, "balance_loss_mlp": 1.02112436, "epoch": 0.6232189021824085, "flos": 34528217028480.0, "grad_norm": 1.7583238406418198, "language_loss": 0.71962136, "learning_rate": 1.3131951444948815e-06, "loss": 0.74150258, "num_input_tokens_seen": 111667090, "step": 5183, "time_per_iteration": 2.6119823455810547 }, { "auxiliary_loss_clip": 0.01148265, "auxiliary_loss_mlp": 0.01024249, "balance_loss_clip": 1.05118251, "balance_loss_mlp": 1.0170188, "epoch": 0.6233391450730476, "flos": 22237000888320.0, "grad_norm": 1.9355787623256409, "language_loss": 0.75934672, "learning_rate": 1.3124635955045546e-06, "loss": 0.78107178, "num_input_tokens_seen": 111686905, "step": 5184, "time_per_iteration": 2.5423145294189453 }, { "auxiliary_loss_clip": 0.01099455, "auxiliary_loss_mlp": 0.00760478, "balance_loss_clip": 1.04134107, "balance_loss_mlp": 1.00017524, "epoch": 0.6234593879636866, "flos": 20332693445760.0, "grad_norm": 1.768472984692508, "language_loss": 0.84171474, "learning_rate": 1.3117321508129537e-06, "loss": 0.86031413, "num_input_tokens_seen": 111704985, "step": 5185, "time_per_iteration": 3.4371814727783203 }, { "auxiliary_loss_clip": 0.01146794, "auxiliary_loss_mlp": 0.01029664, "balance_loss_clip": 1.05001211, "balance_loss_mlp": 1.02262521, "epoch": 0.6235796308543258, "flos": 20664903358080.0, "grad_norm": 1.52837673526448, "language_loss": 0.76622415, "learning_rate": 1.3110008105310388e-06, "loss": 0.78798872, "num_input_tokens_seen": 111724805, "step": 5186, "time_per_iteration": 2.5734033584594727 }, { "auxiliary_loss_clip": 0.01168696, "auxiliary_loss_mlp": 0.01024289, "balance_loss_clip": 1.04785001, "balance_loss_mlp": 1.01701713, "epoch": 0.6236998737449648, "flos": 26618641441920.0, "grad_norm": 1.600723080134172, "language_loss": 0.77884406, "learning_rate": 1.3102695747697526e-06, "loss": 0.80077398, "num_input_tokens_seen": 111747675, "step": 5187, "time_per_iteration": 3.2823331356048584 }, { "auxiliary_loss_clip": 0.01102006, "auxiliary_loss_mlp": 0.01035022, "balance_loss_clip": 1.04736614, "balance_loss_mlp": 1.02757716, "epoch": 0.6238201166356039, "flos": 12674599954560.0, "grad_norm": 2.3129180404277667, "language_loss": 0.90380132, "learning_rate": 1.3095384436400237e-06, "loss": 0.92517161, "num_input_tokens_seen": 111759205, "step": 5188, "time_per_iteration": 2.560361385345459 }, { "auxiliary_loss_clip": 0.01146146, "auxiliary_loss_mlp": 0.01028129, "balance_loss_clip": 1.04902518, "balance_loss_mlp": 1.02074957, "epoch": 0.623940359526243, "flos": 10452160730880.0, "grad_norm": 1.9810178738965096, "language_loss": 0.8193481, "learning_rate": 1.3088074172527633e-06, "loss": 0.8410908, "num_input_tokens_seen": 111776335, "step": 5189, "time_per_iteration": 2.5189342498779297 }, { "auxiliary_loss_clip": 0.01143915, "auxiliary_loss_mlp": 0.01034559, "balance_loss_clip": 1.04769528, "balance_loss_mlp": 1.02675676, "epoch": 0.6240606024168821, "flos": 29059525226880.0, "grad_norm": 2.20769104268534, "language_loss": 0.71468085, "learning_rate": 1.3080764957188684e-06, "loss": 0.73646557, "num_input_tokens_seen": 111796580, "step": 5190, "time_per_iteration": 2.6017136573791504 }, { "auxiliary_loss_clip": 0.01112981, "auxiliary_loss_mlp": 0.01027448, "balance_loss_clip": 1.04456329, "balance_loss_mlp": 1.02017629, "epoch": 0.6241808453075212, "flos": 22018089450240.0, "grad_norm": 1.7542403417941204, "language_loss": 0.70396137, "learning_rate": 1.3073456791492192e-06, "loss": 0.72536564, "num_input_tokens_seen": 111816290, "step": 5191, "time_per_iteration": 2.618889808654785 }, { "auxiliary_loss_clip": 0.01142517, "auxiliary_loss_mlp": 0.01028263, "balance_loss_clip": 1.04556227, "balance_loss_mlp": 1.02089298, "epoch": 0.6243010881981603, "flos": 21138708683520.0, "grad_norm": 1.8197945253148466, "language_loss": 0.78072929, "learning_rate": 1.3066149676546801e-06, "loss": 0.80243707, "num_input_tokens_seen": 111834470, "step": 5192, "time_per_iteration": 2.5318827629089355 }, { "auxiliary_loss_clip": 0.011379, "auxiliary_loss_mlp": 0.0102957, "balance_loss_clip": 1.04870832, "balance_loss_mlp": 1.02213132, "epoch": 0.6244213310887994, "flos": 22344948236160.0, "grad_norm": 1.7766372821926621, "language_loss": 0.66128749, "learning_rate": 1.3058843613460985e-06, "loss": 0.68296218, "num_input_tokens_seen": 111852410, "step": 5193, "time_per_iteration": 2.5914816856384277 }, { "auxiliary_loss_clip": 0.01134029, "auxiliary_loss_mlp": 0.01028407, "balance_loss_clip": 1.04646051, "balance_loss_mlp": 1.02093291, "epoch": 0.6245415739794384, "flos": 15231978524160.0, "grad_norm": 2.0117209516271513, "language_loss": 0.74408537, "learning_rate": 1.3051538603343075e-06, "loss": 0.76570976, "num_input_tokens_seen": 111870340, "step": 5194, "time_per_iteration": 2.568589925765991 }, { "auxiliary_loss_clip": 0.01157628, "auxiliary_loss_mlp": 0.010325, "balance_loss_clip": 1.05144441, "balance_loss_mlp": 1.02542841, "epoch": 0.6246618168700776, "flos": 18879891960960.0, "grad_norm": 1.794311297939756, "language_loss": 0.67598224, "learning_rate": 1.3044234647301235e-06, "loss": 0.69788355, "num_input_tokens_seen": 111888365, "step": 5195, "time_per_iteration": 2.496995210647583 }, { "auxiliary_loss_clip": 0.01153044, "auxiliary_loss_mlp": 0.01026313, "balance_loss_clip": 1.04899132, "balance_loss_mlp": 1.01918125, "epoch": 0.6247820597607167, "flos": 14319201087360.0, "grad_norm": 1.7610892013042936, "language_loss": 0.72327089, "learning_rate": 1.303693174644347e-06, "loss": 0.7450645, "num_input_tokens_seen": 111905840, "step": 5196, "time_per_iteration": 2.4909324645996094 }, { "auxiliary_loss_clip": 0.01136989, "auxiliary_loss_mlp": 0.01028997, "balance_loss_clip": 1.04617727, "balance_loss_mlp": 1.02153397, "epoch": 0.6249023026513557, "flos": 22637979388800.0, "grad_norm": 1.9314742743347986, "language_loss": 0.80449784, "learning_rate": 1.3029629901877625e-06, "loss": 0.82615769, "num_input_tokens_seen": 111925215, "step": 5197, "time_per_iteration": 2.5404744148254395 }, { "auxiliary_loss_clip": 0.01159302, "auxiliary_loss_mlp": 0.01023337, "balance_loss_clip": 1.05096984, "balance_loss_mlp": 1.01540923, "epoch": 0.6250225455419949, "flos": 20266690204800.0, "grad_norm": 2.7731438562953623, "language_loss": 0.77385223, "learning_rate": 1.3022329114711376e-06, "loss": 0.79567862, "num_input_tokens_seen": 111943925, "step": 5198, "time_per_iteration": 2.4914488792419434 }, { "auxiliary_loss_clip": 0.01136987, "auxiliary_loss_mlp": 0.01027011, "balance_loss_clip": 1.04812264, "balance_loss_mlp": 1.01947665, "epoch": 0.6251427884326339, "flos": 23437853400960.0, "grad_norm": 1.7919818908305056, "language_loss": 0.6962198, "learning_rate": 1.3015029386052256e-06, "loss": 0.71785975, "num_input_tokens_seen": 111964095, "step": 5199, "time_per_iteration": 2.5591471195220947 }, { "auxiliary_loss_clip": 0.011369, "auxiliary_loss_mlp": 0.01031399, "balance_loss_clip": 1.04892445, "balance_loss_mlp": 1.0240171, "epoch": 0.625263031323273, "flos": 31723055464320.0, "grad_norm": 1.8315489891565093, "language_loss": 0.72909236, "learning_rate": 1.3007730717007622e-06, "loss": 0.75077528, "num_input_tokens_seen": 111984910, "step": 5200, "time_per_iteration": 2.6530284881591797 }, { "auxiliary_loss_clip": 0.01172937, "auxiliary_loss_mlp": 0.01028709, "balance_loss_clip": 1.05205774, "balance_loss_mlp": 1.02067447, "epoch": 0.6253832742139122, "flos": 24134341092480.0, "grad_norm": 1.8346682050444139, "language_loss": 0.75605935, "learning_rate": 1.3000433108684676e-06, "loss": 0.77807581, "num_input_tokens_seen": 112005410, "step": 5201, "time_per_iteration": 0.6659543514251709 }, { "auxiliary_loss_clip": 0.01154229, "auxiliary_loss_mlp": 0.01025495, "balance_loss_clip": 1.05041695, "balance_loss_mlp": 1.01802015, "epoch": 0.6255035171045512, "flos": 27668812400640.0, "grad_norm": 2.541733654732183, "language_loss": 0.80216455, "learning_rate": 1.2993136562190467e-06, "loss": 0.82396173, "num_input_tokens_seen": 112024530, "step": 5202, "time_per_iteration": 2.5568699836730957 }, { "auxiliary_loss_clip": 0.01147457, "auxiliary_loss_mlp": 0.01025065, "balance_loss_clip": 1.04928041, "balance_loss_mlp": 1.01796317, "epoch": 0.6256237599951903, "flos": 20227798753920.0, "grad_norm": 1.5321611409742733, "language_loss": 0.70474744, "learning_rate": 1.2985841078631871e-06, "loss": 0.72647274, "num_input_tokens_seen": 112043850, "step": 5203, "time_per_iteration": 2.5611369609832764 }, { "auxiliary_loss_clip": 0.01097131, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.03868425, "balance_loss_mlp": 1.02524996, "epoch": 0.6257440028858293, "flos": 24170574936960.0, "grad_norm": 1.7145014743278557, "language_loss": 0.78358847, "learning_rate": 1.2978546659115608e-06, "loss": 0.80488747, "num_input_tokens_seen": 112061930, "step": 5204, "time_per_iteration": 2.655010223388672 }, { "auxiliary_loss_clip": 0.01146859, "auxiliary_loss_mlp": 0.01030734, "balance_loss_clip": 1.04952753, "balance_loss_mlp": 1.0237726, "epoch": 0.6258642457764685, "flos": 15851940289920.0, "grad_norm": 1.8766782093286296, "language_loss": 0.85402501, "learning_rate": 1.2971253304748228e-06, "loss": 0.87580097, "num_input_tokens_seen": 112079645, "step": 5205, "time_per_iteration": 2.534308671951294 }, { "auxiliary_loss_clip": 0.01159732, "auxiliary_loss_mlp": 0.01032658, "balance_loss_clip": 1.05185747, "balance_loss_mlp": 1.02463484, "epoch": 0.6259844886671075, "flos": 11911354836480.0, "grad_norm": 1.699075465959052, "language_loss": 0.74669278, "learning_rate": 1.296396101663614e-06, "loss": 0.76861668, "num_input_tokens_seen": 112096205, "step": 5206, "time_per_iteration": 2.4956533908843994 }, { "auxiliary_loss_clip": 0.0116016, "auxiliary_loss_mlp": 0.01027557, "balance_loss_clip": 1.05104709, "balance_loss_mlp": 1.01968634, "epoch": 0.6261047315577466, "flos": 15887958652800.0, "grad_norm": 2.2929426295305912, "language_loss": 0.84167844, "learning_rate": 1.2956669795885565e-06, "loss": 0.86355561, "num_input_tokens_seen": 112112835, "step": 5207, "time_per_iteration": 2.5008697509765625 }, { "auxiliary_loss_clip": 0.0112475, "auxiliary_loss_mlp": 0.01038334, "balance_loss_clip": 1.04937518, "balance_loss_mlp": 1.03019166, "epoch": 0.6262249744483858, "flos": 31248926916480.0, "grad_norm": 1.9414601215441698, "language_loss": 0.68169481, "learning_rate": 1.294937964360259e-06, "loss": 0.70332569, "num_input_tokens_seen": 112133105, "step": 5208, "time_per_iteration": 3.4493167400360107 }, { "auxiliary_loss_clip": 0.01145109, "auxiliary_loss_mlp": 0.01027528, "balance_loss_clip": 1.04747438, "balance_loss_mlp": 1.01966906, "epoch": 0.6263452173390248, "flos": 27198598435200.0, "grad_norm": 2.2348863702348964, "language_loss": 0.71896529, "learning_rate": 1.2942090560893108e-06, "loss": 0.74069166, "num_input_tokens_seen": 112152510, "step": 5209, "time_per_iteration": 2.589372396469116 }, { "auxiliary_loss_clip": 0.01169763, "auxiliary_loss_mlp": 0.01030486, "balance_loss_clip": 1.05144584, "balance_loss_mlp": 1.02317834, "epoch": 0.6264654602296639, "flos": 37342069683840.0, "grad_norm": 1.8133664996820804, "language_loss": 0.60170698, "learning_rate": 1.2934802548862882e-06, "loss": 0.6237095, "num_input_tokens_seen": 112175295, "step": 5210, "time_per_iteration": 2.6099376678466797 }, { "auxiliary_loss_clip": 0.01139698, "auxiliary_loss_mlp": 0.01025256, "balance_loss_clip": 1.04686618, "balance_loss_mlp": 1.01835346, "epoch": 0.626585703120303, "flos": 14756952136320.0, "grad_norm": 1.929557069517171, "language_loss": 0.82623327, "learning_rate": 1.292751560861749e-06, "loss": 0.84788275, "num_input_tokens_seen": 112190200, "step": 5211, "time_per_iteration": 3.336229085922241 }, { "auxiliary_loss_clip": 0.01174293, "auxiliary_loss_mlp": 0.0102262, "balance_loss_clip": 1.05301905, "balance_loss_mlp": 1.01513422, "epoch": 0.6267059460109421, "flos": 22347318533760.0, "grad_norm": 1.6852147687894672, "language_loss": 0.7974298, "learning_rate": 1.2920229741262354e-06, "loss": 0.819399, "num_input_tokens_seen": 112208205, "step": 5212, "time_per_iteration": 2.499915838241577 }, { "auxiliary_loss_clip": 0.01146073, "auxiliary_loss_mlp": 0.01024091, "balance_loss_clip": 1.0495106, "balance_loss_mlp": 1.01677203, "epoch": 0.6268261889015811, "flos": 17748813617280.0, "grad_norm": 2.1766925770676577, "language_loss": 0.75174648, "learning_rate": 1.2912944947902739e-06, "loss": 0.77344811, "num_input_tokens_seen": 112224690, "step": 5213, "time_per_iteration": 3.2579081058502197 }, { "auxiliary_loss_clip": 0.01148193, "auxiliary_loss_mlp": 0.01030456, "balance_loss_clip": 1.04871202, "balance_loss_mlp": 1.02298164, "epoch": 0.6269464317922203, "flos": 32846484211200.0, "grad_norm": 2.3897229614298356, "language_loss": 0.71500623, "learning_rate": 1.2905661229643742e-06, "loss": 0.73679268, "num_input_tokens_seen": 112244450, "step": 5214, "time_per_iteration": 2.6466479301452637 }, { "auxiliary_loss_clip": 0.01171117, "auxiliary_loss_mlp": 0.01027445, "balance_loss_clip": 1.05068719, "balance_loss_mlp": 1.0199405, "epoch": 0.6270666746828594, "flos": 17929192740480.0, "grad_norm": 2.146285455772101, "language_loss": 0.84430504, "learning_rate": 1.2898378587590299e-06, "loss": 0.86629069, "num_input_tokens_seen": 112261050, "step": 5215, "time_per_iteration": 2.451789140701294 }, { "auxiliary_loss_clip": 0.01155099, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.05147719, "balance_loss_mlp": 1.02666497, "epoch": 0.6271869175734984, "flos": 17457326749440.0, "grad_norm": 1.702420332328967, "language_loss": 0.87213141, "learning_rate": 1.2891097022847173e-06, "loss": 0.89402312, "num_input_tokens_seen": 112278395, "step": 5216, "time_per_iteration": 2.472630023956299 }, { "auxiliary_loss_clip": 0.01146782, "auxiliary_loss_mlp": 0.01032374, "balance_loss_clip": 1.05018306, "balance_loss_mlp": 1.02408314, "epoch": 0.6273071604641376, "flos": 26868615166080.0, "grad_norm": 1.8197510911623902, "language_loss": 0.66592324, "learning_rate": 1.2883816536518978e-06, "loss": 0.68771487, "num_input_tokens_seen": 112299535, "step": 5217, "time_per_iteration": 2.599257469177246 }, { "auxiliary_loss_clip": 0.01154425, "auxiliary_loss_mlp": 0.0102718, "balance_loss_clip": 1.04974794, "balance_loss_mlp": 1.01989055, "epoch": 0.6274274033547766, "flos": 26062384446720.0, "grad_norm": 1.8153244458861182, "language_loss": 0.81823868, "learning_rate": 1.2876537129710155e-06, "loss": 0.84005469, "num_input_tokens_seen": 112317265, "step": 5218, "time_per_iteration": 2.5419886112213135 }, { "auxiliary_loss_clip": 0.01137331, "auxiliary_loss_mlp": 0.01028481, "balance_loss_clip": 1.0506041, "balance_loss_mlp": 1.02048206, "epoch": 0.6275476462454157, "flos": 20266259241600.0, "grad_norm": 1.9794411490862094, "language_loss": 0.75545585, "learning_rate": 1.286925880352499e-06, "loss": 0.77711391, "num_input_tokens_seen": 112336125, "step": 5219, "time_per_iteration": 2.527164936065674 }, { "auxiliary_loss_clip": 0.0113687, "auxiliary_loss_mlp": 0.01027478, "balance_loss_clip": 1.0460633, "balance_loss_mlp": 1.01998818, "epoch": 0.6276678891360549, "flos": 26320402817280.0, "grad_norm": 1.8570717461055009, "language_loss": 0.7139672, "learning_rate": 1.2861981559067592e-06, "loss": 0.7356106, "num_input_tokens_seen": 112356730, "step": 5220, "time_per_iteration": 2.5783236026763916 }, { "auxiliary_loss_clip": 0.01107449, "auxiliary_loss_mlp": 0.01024124, "balance_loss_clip": 1.04382324, "balance_loss_mlp": 1.01718581, "epoch": 0.6277881320266939, "flos": 13912512324480.0, "grad_norm": 1.9140501614911964, "language_loss": 0.80344212, "learning_rate": 1.2854705397441917e-06, "loss": 0.82475781, "num_input_tokens_seen": 112372270, "step": 5221, "time_per_iteration": 2.5935115814208984 }, { "auxiliary_loss_clip": 0.01126249, "auxiliary_loss_mlp": 0.0102702, "balance_loss_clip": 1.04588342, "balance_loss_mlp": 1.02028453, "epoch": 0.627908374917333, "flos": 27048922462080.0, "grad_norm": 2.0061590324379353, "language_loss": 0.77362049, "learning_rate": 1.2847430319751747e-06, "loss": 0.79515314, "num_input_tokens_seen": 112390365, "step": 5222, "time_per_iteration": 2.6301941871643066 }, { "auxiliary_loss_clip": 0.01151667, "auxiliary_loss_mlp": 0.0103216, "balance_loss_clip": 1.05072439, "balance_loss_mlp": 1.0248909, "epoch": 0.6280286178079721, "flos": 23769201386880.0, "grad_norm": 2.2119271624858143, "language_loss": 0.67226642, "learning_rate": 1.2840156327100712e-06, "loss": 0.69410467, "num_input_tokens_seen": 112407490, "step": 5223, "time_per_iteration": 2.5097033977508545 }, { "auxiliary_loss_clip": 0.01171648, "auxiliary_loss_mlp": 0.01025604, "balance_loss_clip": 1.05217803, "balance_loss_mlp": 1.01820135, "epoch": 0.6281488606986112, "flos": 26359150613760.0, "grad_norm": 1.7317833074892703, "language_loss": 0.72275043, "learning_rate": 1.2832883420592272e-06, "loss": 0.74472296, "num_input_tokens_seen": 112426385, "step": 5224, "time_per_iteration": 2.5106117725372314 }, { "auxiliary_loss_clip": 0.01138828, "auxiliary_loss_mlp": 0.01027795, "balance_loss_clip": 1.04916835, "balance_loss_mlp": 1.02081525, "epoch": 0.6282691035892503, "flos": 36137194848000.0, "grad_norm": 2.2476171281782773, "language_loss": 0.64646047, "learning_rate": 1.282561160132972e-06, "loss": 0.6681267, "num_input_tokens_seen": 112446905, "step": 5225, "time_per_iteration": 2.661320686340332 }, { "auxiliary_loss_clip": 0.01147553, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.04704201, "balance_loss_mlp": 1.02475512, "epoch": 0.6283893464798894, "flos": 26537231266560.0, "grad_norm": 1.5761578062372898, "language_loss": 0.80815202, "learning_rate": 1.2818340870416186e-06, "loss": 0.8299495, "num_input_tokens_seen": 112468040, "step": 5226, "time_per_iteration": 2.6052603721618652 }, { "auxiliary_loss_clip": 0.01134962, "auxiliary_loss_mlp": 0.01028382, "balance_loss_clip": 1.04452109, "balance_loss_mlp": 1.0204246, "epoch": 0.6285095893705285, "flos": 22237216369920.0, "grad_norm": 1.8138038511275894, "language_loss": 0.75804353, "learning_rate": 1.2811071228954626e-06, "loss": 0.77967697, "num_input_tokens_seen": 112486675, "step": 5227, "time_per_iteration": 3.3870127201080322 }, { "auxiliary_loss_clip": 0.01140014, "auxiliary_loss_mlp": 0.01027383, "balance_loss_clip": 1.04765677, "balance_loss_mlp": 1.02028358, "epoch": 0.6286298322611675, "flos": 26542259170560.0, "grad_norm": 1.9712488372984, "language_loss": 0.80846465, "learning_rate": 1.2803802678047846e-06, "loss": 0.83013856, "num_input_tokens_seen": 112506825, "step": 5228, "time_per_iteration": 2.5906550884246826 }, { "auxiliary_loss_clip": 0.01148906, "auxiliary_loss_mlp": 0.01032897, "balance_loss_clip": 1.05155087, "balance_loss_mlp": 1.0247308, "epoch": 0.6287500751518067, "flos": 21795227516160.0, "grad_norm": 1.6809937527976335, "language_loss": 0.73997855, "learning_rate": 1.279653521879848e-06, "loss": 0.76179653, "num_input_tokens_seen": 112526890, "step": 5229, "time_per_iteration": 2.541074752807617 }, { "auxiliary_loss_clip": 0.01079756, "auxiliary_loss_mlp": 0.01033265, "balance_loss_clip": 1.04111624, "balance_loss_mlp": 1.02635968, "epoch": 0.6288703180424458, "flos": 20009605587840.0, "grad_norm": 2.0193330013215007, "language_loss": 0.83691359, "learning_rate": 1.2789268852308997e-06, "loss": 0.85804379, "num_input_tokens_seen": 112542100, "step": 5230, "time_per_iteration": 2.633125066757202 }, { "auxiliary_loss_clip": 0.01148882, "auxiliary_loss_mlp": 0.01029244, "balance_loss_clip": 1.04881716, "balance_loss_mlp": 1.02206719, "epoch": 0.6289905609330848, "flos": 22124923476480.0, "grad_norm": 1.85697216489786, "language_loss": 0.70357013, "learning_rate": 1.2782003579681688e-06, "loss": 0.72535145, "num_input_tokens_seen": 112561630, "step": 5231, "time_per_iteration": 2.5269412994384766 }, { "auxiliary_loss_clip": 0.01175377, "auxiliary_loss_mlp": 0.01033884, "balance_loss_clip": 1.05518019, "balance_loss_mlp": 1.02606678, "epoch": 0.629110803823724, "flos": 25518481729920.0, "grad_norm": 1.578646478280308, "language_loss": 0.74172223, "learning_rate": 1.2774739402018701e-06, "loss": 0.76381481, "num_input_tokens_seen": 112582465, "step": 5232, "time_per_iteration": 2.5098023414611816 }, { "auxiliary_loss_clip": 0.0115308, "auxiliary_loss_mlp": 0.01029405, "balance_loss_clip": 1.04943681, "balance_loss_mlp": 1.02154374, "epoch": 0.629231046714363, "flos": 20886616056960.0, "grad_norm": 1.598995911542712, "language_loss": 0.7286483, "learning_rate": 1.2767476320422002e-06, "loss": 0.75047314, "num_input_tokens_seen": 112602390, "step": 5233, "time_per_iteration": 2.5157248973846436 }, { "auxiliary_loss_clip": 0.01037167, "auxiliary_loss_mlp": 0.01002988, "balance_loss_clip": 1.01615548, "balance_loss_mlp": 1.00164688, "epoch": 0.6293512896050021, "flos": 65050027908480.0, "grad_norm": 0.6795422908223762, "language_loss": 0.57200038, "learning_rate": 1.2760214335993392e-06, "loss": 0.59240198, "num_input_tokens_seen": 112669035, "step": 5234, "time_per_iteration": 4.00806188583374 }, { "auxiliary_loss_clip": 0.01148257, "auxiliary_loss_mlp": 0.01026214, "balance_loss_clip": 1.04674411, "balance_loss_mlp": 1.01939559, "epoch": 0.6294715324956413, "flos": 34677857088000.0, "grad_norm": 2.1005456680586327, "language_loss": 0.58687347, "learning_rate": 1.2752953449834514e-06, "loss": 0.60861814, "num_input_tokens_seen": 112691485, "step": 5235, "time_per_iteration": 2.6186487674713135 }, { "auxiliary_loss_clip": 0.01168195, "auxiliary_loss_mlp": 0.01026622, "balance_loss_clip": 1.05009174, "balance_loss_mlp": 1.01955247, "epoch": 0.6295917753862803, "flos": 22784207656320.0, "grad_norm": 1.5590057609326045, "language_loss": 0.80316848, "learning_rate": 1.2745693663046836e-06, "loss": 0.82511663, "num_input_tokens_seen": 112710555, "step": 5236, "time_per_iteration": 2.472851514816284 }, { "auxiliary_loss_clip": 0.01153379, "auxiliary_loss_mlp": 0.01023058, "balance_loss_clip": 1.05023289, "balance_loss_mlp": 1.01629281, "epoch": 0.6297120182769194, "flos": 20850454039680.0, "grad_norm": 1.7347657083228893, "language_loss": 0.80440217, "learning_rate": 1.2738434976731662e-06, "loss": 0.82616651, "num_input_tokens_seen": 112728740, "step": 5237, "time_per_iteration": 3.3013107776641846 }, { "auxiliary_loss_clip": 0.01143186, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.05051565, "balance_loss_mlp": 1.02054608, "epoch": 0.6298322611675584, "flos": 19497662997120.0, "grad_norm": 1.6899578468461065, "language_loss": 0.75417429, "learning_rate": 1.2731177391990125e-06, "loss": 0.77588552, "num_input_tokens_seen": 112748665, "step": 5238, "time_per_iteration": 3.2653064727783203 }, { "auxiliary_loss_clip": 0.01142885, "auxiliary_loss_mlp": 0.01026974, "balance_loss_clip": 1.04751539, "balance_loss_mlp": 1.01967192, "epoch": 0.6299525040581976, "flos": 12604466649600.0, "grad_norm": 3.9175031458442633, "language_loss": 0.81791008, "learning_rate": 1.2723920909923203e-06, "loss": 0.83960867, "num_input_tokens_seen": 112764410, "step": 5239, "time_per_iteration": 2.504490613937378 }, { "auxiliary_loss_clip": 0.01066408, "auxiliary_loss_mlp": 0.01002761, "balance_loss_clip": 1.01783037, "balance_loss_mlp": 1.00162268, "epoch": 0.6300727469488366, "flos": 57725685636480.0, "grad_norm": 0.847045568964251, "language_loss": 0.60465908, "learning_rate": 1.2716665531631688e-06, "loss": 0.62535071, "num_input_tokens_seen": 112818695, "step": 5240, "time_per_iteration": 3.0114898681640625 }, { "auxiliary_loss_clip": 0.01158583, "auxiliary_loss_mlp": 0.01024111, "balance_loss_clip": 1.04941559, "balance_loss_mlp": 1.0167532, "epoch": 0.6301929898394757, "flos": 22527302607360.0, "grad_norm": 1.6242927907048788, "language_loss": 0.77171874, "learning_rate": 1.270941125821623e-06, "loss": 0.79354572, "num_input_tokens_seen": 112839120, "step": 5241, "time_per_iteration": 2.5108468532562256 }, { "auxiliary_loss_clip": 0.01151103, "auxiliary_loss_mlp": 0.01027164, "balance_loss_clip": 1.04716671, "balance_loss_mlp": 1.01977921, "epoch": 0.6303132327301149, "flos": 28293550675200.0, "grad_norm": 1.5762895922832727, "language_loss": 0.75326675, "learning_rate": 1.2702158090777278e-06, "loss": 0.77504945, "num_input_tokens_seen": 112860210, "step": 5242, "time_per_iteration": 2.5658206939697266 }, { "auxiliary_loss_clip": 0.01124019, "auxiliary_loss_mlp": 0.01032182, "balance_loss_clip": 1.04518557, "balance_loss_mlp": 1.02541709, "epoch": 0.6304334756207539, "flos": 25264521596160.0, "grad_norm": 1.841698726774463, "language_loss": 0.74236083, "learning_rate": 1.2694906030415148e-06, "loss": 0.76392281, "num_input_tokens_seen": 112877955, "step": 5243, "time_per_iteration": 2.6280767917633057 }, { "auxiliary_loss_clip": 0.01152035, "auxiliary_loss_mlp": 0.01028711, "balance_loss_clip": 1.049317, "balance_loss_mlp": 1.02142739, "epoch": 0.630553718511393, "flos": 18033548728320.0, "grad_norm": 2.3355892446748956, "language_loss": 0.82109106, "learning_rate": 1.2687655078229958e-06, "loss": 0.84289849, "num_input_tokens_seen": 112892285, "step": 5244, "time_per_iteration": 2.483689785003662 }, { "auxiliary_loss_clip": 0.01141443, "auxiliary_loss_mlp": 0.01032853, "balance_loss_clip": 1.04911995, "balance_loss_mlp": 1.02563179, "epoch": 0.6306739614020321, "flos": 27304103658240.0, "grad_norm": 2.4028599496150136, "language_loss": 0.69385219, "learning_rate": 1.2680405235321678e-06, "loss": 0.71559513, "num_input_tokens_seen": 112913620, "step": 5245, "time_per_iteration": 2.6019532680511475 }, { "auxiliary_loss_clip": 0.01146026, "auxiliary_loss_mlp": 0.00761629, "balance_loss_clip": 1.05246842, "balance_loss_mlp": 1.00022018, "epoch": 0.6307942042926712, "flos": 15341434243200.0, "grad_norm": 2.1548742754596018, "language_loss": 0.7840693, "learning_rate": 1.267315650279011e-06, "loss": 0.80314589, "num_input_tokens_seen": 112932090, "step": 5246, "time_per_iteration": 2.5243239402770996 }, { "auxiliary_loss_clip": 0.0112408, "auxiliary_loss_mlp": 0.01025901, "balance_loss_clip": 1.04981184, "balance_loss_mlp": 1.01900232, "epoch": 0.6309144471833102, "flos": 19606400444160.0, "grad_norm": 1.7956486715892954, "language_loss": 0.74300712, "learning_rate": 1.2665908881734874e-06, "loss": 0.76450694, "num_input_tokens_seen": 112950925, "step": 5247, "time_per_iteration": 2.56534743309021 }, { "auxiliary_loss_clip": 0.01155315, "auxiliary_loss_mlp": 0.01027075, "balance_loss_clip": 1.04957557, "balance_loss_mlp": 1.01992297, "epoch": 0.6310346900739494, "flos": 17493345112320.0, "grad_norm": 2.205427431304428, "language_loss": 0.85000455, "learning_rate": 1.2658662373255432e-06, "loss": 0.87182844, "num_input_tokens_seen": 112969315, "step": 5248, "time_per_iteration": 2.4878129959106445 }, { "auxiliary_loss_clip": 0.01042392, "auxiliary_loss_mlp": 0.01000422, "balance_loss_clip": 1.01508808, "balance_loss_mlp": 0.99920636, "epoch": 0.6311549329645885, "flos": 55070164131840.0, "grad_norm": 0.7088834820248641, "language_loss": 0.5232054, "learning_rate": 1.2651416978451063e-06, "loss": 0.54363358, "num_input_tokens_seen": 113034700, "step": 5249, "time_per_iteration": 3.1999776363372803 }, { "auxiliary_loss_clip": 0.01175047, "auxiliary_loss_mlp": 0.01025881, "balance_loss_clip": 1.05469072, "balance_loss_mlp": 1.01813221, "epoch": 0.6312751758552275, "flos": 41902545075840.0, "grad_norm": 1.7954949156765625, "language_loss": 0.64946747, "learning_rate": 1.2644172698420903e-06, "loss": 0.67147672, "num_input_tokens_seen": 113056805, "step": 5250, "time_per_iteration": 2.6572580337524414 }, { "auxiliary_loss_clip": 0.01128867, "auxiliary_loss_mlp": 0.01025845, "balance_loss_clip": 1.04681277, "balance_loss_mlp": 1.01833153, "epoch": 0.6313954187458667, "flos": 19646800266240.0, "grad_norm": 1.8926707312391093, "language_loss": 0.84562075, "learning_rate": 1.2636929534263892e-06, "loss": 0.86716783, "num_input_tokens_seen": 113075790, "step": 5251, "time_per_iteration": 2.5637598037719727 }, { "auxiliary_loss_clip": 0.01127453, "auxiliary_loss_mlp": 0.01028883, "balance_loss_clip": 1.04222584, "balance_loss_mlp": 1.02166438, "epoch": 0.6315156616365057, "flos": 22894273906560.0, "grad_norm": 1.6215485082009613, "language_loss": 0.77812272, "learning_rate": 1.2629687487078821e-06, "loss": 0.79968607, "num_input_tokens_seen": 113094600, "step": 5252, "time_per_iteration": 3.3799195289611816 }, { "auxiliary_loss_clip": 0.01157519, "auxiliary_loss_mlp": 0.01023315, "balance_loss_clip": 1.04858518, "balance_loss_mlp": 1.0161562, "epoch": 0.6316359045271448, "flos": 23726251699200.0, "grad_norm": 1.9671342806008167, "language_loss": 0.7660585, "learning_rate": 1.2622446557964293e-06, "loss": 0.78786683, "num_input_tokens_seen": 113112605, "step": 5253, "time_per_iteration": 2.509108781814575 }, { "auxiliary_loss_clip": 0.01141001, "auxiliary_loss_mlp": 0.01025798, "balance_loss_clip": 1.04520106, "balance_loss_mlp": 1.01812065, "epoch": 0.631756147417784, "flos": 33108417164160.0, "grad_norm": 1.733373803407461, "language_loss": 0.71466649, "learning_rate": 1.261520674801876e-06, "loss": 0.73633444, "num_input_tokens_seen": 113133200, "step": 5254, "time_per_iteration": 2.6446657180786133 }, { "auxiliary_loss_clip": 0.01140635, "auxiliary_loss_mlp": 0.01028362, "balance_loss_clip": 1.05056453, "balance_loss_mlp": 1.0202204, "epoch": 0.631876390308423, "flos": 31248424126080.0, "grad_norm": 2.465347172163128, "language_loss": 0.72640914, "learning_rate": 1.2607968058340488e-06, "loss": 0.74809909, "num_input_tokens_seen": 113152895, "step": 5255, "time_per_iteration": 2.6063601970672607 }, { "auxiliary_loss_clip": 0.01137228, "auxiliary_loss_mlp": 0.01024234, "balance_loss_clip": 1.045964, "balance_loss_mlp": 1.01728106, "epoch": 0.6319966331990621, "flos": 24681152810880.0, "grad_norm": 1.6619899516137349, "language_loss": 0.72924769, "learning_rate": 1.2600730490027583e-06, "loss": 0.75086236, "num_input_tokens_seen": 113173135, "step": 5256, "time_per_iteration": 2.586622714996338 }, { "auxiliary_loss_clip": 0.01129008, "auxiliary_loss_mlp": 0.01029954, "balance_loss_clip": 1.04696083, "balance_loss_mlp": 1.02269399, "epoch": 0.6321168760897012, "flos": 17491764913920.0, "grad_norm": 1.6945186845314215, "language_loss": 0.80235016, "learning_rate": 1.2593494044177984e-06, "loss": 0.8239398, "num_input_tokens_seen": 113191440, "step": 5257, "time_per_iteration": 2.553191661834717 }, { "auxiliary_loss_clip": 0.01172076, "auxiliary_loss_mlp": 0.01027334, "balance_loss_clip": 1.04980528, "balance_loss_mlp": 1.01907921, "epoch": 0.6322371189803403, "flos": 18295373940480.0, "grad_norm": 2.7746532501027885, "language_loss": 0.80743104, "learning_rate": 1.2586258721889448e-06, "loss": 0.82942522, "num_input_tokens_seen": 113208790, "step": 5258, "time_per_iteration": 2.438913345336914 }, { "auxiliary_loss_clip": 0.01105792, "auxiliary_loss_mlp": 0.01029974, "balance_loss_clip": 1.04590118, "balance_loss_mlp": 1.02233911, "epoch": 0.6323573618709794, "flos": 20157270399360.0, "grad_norm": 2.2843716298573917, "language_loss": 0.81504565, "learning_rate": 1.2579024524259573e-06, "loss": 0.83640337, "num_input_tokens_seen": 113225050, "step": 5259, "time_per_iteration": 2.570824384689331 }, { "auxiliary_loss_clip": 0.01135414, "auxiliary_loss_mlp": 0.01023085, "balance_loss_clip": 1.04289341, "balance_loss_mlp": 1.01602745, "epoch": 0.6324776047616185, "flos": 20042391726720.0, "grad_norm": 1.8595788246039904, "language_loss": 0.91569132, "learning_rate": 1.2571791452385768e-06, "loss": 0.93727636, "num_input_tokens_seen": 113242315, "step": 5260, "time_per_iteration": 3.3257579803466797 }, { "auxiliary_loss_clip": 0.01141395, "auxiliary_loss_mlp": 0.01025675, "balance_loss_clip": 1.04862928, "balance_loss_mlp": 1.01919639, "epoch": 0.6325978476522576, "flos": 30848235724800.0, "grad_norm": 1.8265499148477788, "language_loss": 0.7705543, "learning_rate": 1.2564559507365301e-06, "loss": 0.79222506, "num_input_tokens_seen": 113264720, "step": 5261, "time_per_iteration": 2.6189589500427246 }, { "auxiliary_loss_clip": 0.01143137, "auxiliary_loss_mlp": 0.01023816, "balance_loss_clip": 1.05031931, "balance_loss_mlp": 1.01613259, "epoch": 0.6327180905428966, "flos": 24535104111360.0, "grad_norm": 2.2016424927665823, "language_loss": 0.79216838, "learning_rate": 1.2557328690295244e-06, "loss": 0.81383789, "num_input_tokens_seen": 113282910, "step": 5262, "time_per_iteration": 2.5651912689208984 }, { "auxiliary_loss_clip": 0.01130835, "auxiliary_loss_mlp": 0.01024337, "balance_loss_clip": 1.04833591, "balance_loss_mlp": 1.01692843, "epoch": 0.6328383334335358, "flos": 21575274583680.0, "grad_norm": 1.5665022343446005, "language_loss": 0.76015413, "learning_rate": 1.255009900227251e-06, "loss": 0.7817058, "num_input_tokens_seen": 113301935, "step": 5263, "time_per_iteration": 3.4105286598205566 }, { "auxiliary_loss_clip": 0.01166635, "auxiliary_loss_mlp": 0.01024807, "balance_loss_clip": 1.05072403, "balance_loss_mlp": 1.018152, "epoch": 0.6329585763241748, "flos": 22929861306240.0, "grad_norm": 1.8962725714554596, "language_loss": 0.79651684, "learning_rate": 1.254287044439383e-06, "loss": 0.81843126, "num_input_tokens_seen": 113321540, "step": 5264, "time_per_iteration": 3.202737331390381 }, { "auxiliary_loss_clip": 0.01062243, "auxiliary_loss_mlp": 0.01004344, "balance_loss_clip": 1.01417518, "balance_loss_mlp": 1.00324178, "epoch": 0.6330788192148139, "flos": 70936897847040.0, "grad_norm": 0.780415206594517, "language_loss": 0.54503775, "learning_rate": 1.2535643017755776e-06, "loss": 0.56570363, "num_input_tokens_seen": 113383730, "step": 5265, "time_per_iteration": 3.1741766929626465 }, { "auxiliary_loss_clip": 0.01125211, "auxiliary_loss_mlp": 0.01026569, "balance_loss_clip": 1.04392219, "balance_loss_mlp": 1.01933002, "epoch": 0.6331990621054531, "flos": 21244501215360.0, "grad_norm": 2.553520083964701, "language_loss": 0.72059226, "learning_rate": 1.2528416723454737e-06, "loss": 0.74211007, "num_input_tokens_seen": 113400400, "step": 5266, "time_per_iteration": 2.5623018741607666 }, { "auxiliary_loss_clip": 0.01172493, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 1.05536699, "balance_loss_mlp": 1.02171516, "epoch": 0.6333193049960921, "flos": 34459412526720.0, "grad_norm": 1.4038826377435485, "language_loss": 0.71164888, "learning_rate": 1.2521191562586945e-06, "loss": 0.73366022, "num_input_tokens_seen": 113424050, "step": 5267, "time_per_iteration": 2.5977985858917236 }, { "auxiliary_loss_clip": 0.01168323, "auxiliary_loss_mlp": 0.00760945, "balance_loss_clip": 1.05116308, "balance_loss_mlp": 1.00019634, "epoch": 0.6334395478867312, "flos": 18329883932160.0, "grad_norm": 1.9642862195475197, "language_loss": 0.76787412, "learning_rate": 1.2513967536248445e-06, "loss": 0.78716683, "num_input_tokens_seen": 113440370, "step": 5268, "time_per_iteration": 2.4554340839385986 }, { "auxiliary_loss_clip": 0.01153462, "auxiliary_loss_mlp": 0.01033884, "balance_loss_clip": 1.0515604, "balance_loss_mlp": 1.02688956, "epoch": 0.6335597907773702, "flos": 23623152687360.0, "grad_norm": 1.725292107880279, "language_loss": 0.81158435, "learning_rate": 1.2506744645535117e-06, "loss": 0.83345783, "num_input_tokens_seen": 113460800, "step": 5269, "time_per_iteration": 2.518219232559204 }, { "auxiliary_loss_clip": 0.01131916, "auxiliary_loss_mlp": 0.01028623, "balance_loss_clip": 1.04156303, "balance_loss_mlp": 1.02186418, "epoch": 0.6336800336680094, "flos": 22710913954560.0, "grad_norm": 1.877429165637924, "language_loss": 0.60157049, "learning_rate": 1.249952289154267e-06, "loss": 0.62317592, "num_input_tokens_seen": 113480840, "step": 5270, "time_per_iteration": 2.549922466278076 }, { "auxiliary_loss_clip": 0.01086025, "auxiliary_loss_mlp": 0.01023256, "balance_loss_clip": 1.04093385, "balance_loss_mlp": 1.01630068, "epoch": 0.6338002765586485, "flos": 23622757637760.0, "grad_norm": 1.615336110551679, "language_loss": 0.7660858, "learning_rate": 1.2492302275366635e-06, "loss": 0.78717858, "num_input_tokens_seen": 113500515, "step": 5271, "time_per_iteration": 2.6407384872436523 }, { "auxiliary_loss_clip": 0.01148978, "auxiliary_loss_mlp": 0.01024192, "balance_loss_clip": 1.04731846, "balance_loss_mlp": 1.01576948, "epoch": 0.6339205194492875, "flos": 26505450708480.0, "grad_norm": 2.6687680457954506, "language_loss": 0.65455782, "learning_rate": 1.2485082798102377e-06, "loss": 0.67628944, "num_input_tokens_seen": 113520930, "step": 5272, "time_per_iteration": 2.5411806106567383 }, { "auxiliary_loss_clip": 0.01132202, "auxiliary_loss_mlp": 0.0102847, "balance_loss_clip": 1.04443288, "balance_loss_mlp": 1.02078128, "epoch": 0.6340407623399267, "flos": 18544306170240.0, "grad_norm": 2.048976169153386, "language_loss": 0.68498552, "learning_rate": 1.2477864460845084e-06, "loss": 0.7065922, "num_input_tokens_seen": 113537330, "step": 5273, "time_per_iteration": 2.5432846546173096 }, { "auxiliary_loss_clip": 0.01141896, "auxiliary_loss_mlp": 0.01029799, "balance_loss_clip": 1.04804111, "balance_loss_mlp": 1.02136743, "epoch": 0.6341610052305657, "flos": 17712579772800.0, "grad_norm": 2.88532119228345, "language_loss": 0.73068696, "learning_rate": 1.2470647264689776e-06, "loss": 0.75240386, "num_input_tokens_seen": 113555810, "step": 5274, "time_per_iteration": 2.520463466644287 }, { "auxiliary_loss_clip": 0.01101042, "auxiliary_loss_mlp": 0.01024554, "balance_loss_clip": 1.03979683, "balance_loss_mlp": 1.01730597, "epoch": 0.6342812481212048, "flos": 23587026583680.0, "grad_norm": 2.002164949840915, "language_loss": 0.70989579, "learning_rate": 1.2463431210731282e-06, "loss": 0.73115176, "num_input_tokens_seen": 113575395, "step": 5275, "time_per_iteration": 2.670114278793335 }, { "auxiliary_loss_clip": 0.01117937, "auxiliary_loss_mlp": 0.01026362, "balance_loss_clip": 1.04212427, "balance_loss_mlp": 1.01874161, "epoch": 0.634401491011844, "flos": 17821927751040.0, "grad_norm": 2.5214791031613615, "language_loss": 0.76576108, "learning_rate": 1.2456216300064289e-06, "loss": 0.78720409, "num_input_tokens_seen": 113592945, "step": 5276, "time_per_iteration": 2.595416307449341 }, { "auxiliary_loss_clip": 0.01134302, "auxiliary_loss_mlp": 0.01035185, "balance_loss_clip": 1.04536629, "balance_loss_mlp": 1.02692342, "epoch": 0.634521733902483, "flos": 21358158825600.0, "grad_norm": 1.737726130442711, "language_loss": 0.78408641, "learning_rate": 1.244900253378328e-06, "loss": 0.80578125, "num_input_tokens_seen": 113613000, "step": 5277, "time_per_iteration": 2.5708508491516113 }, { "auxiliary_loss_clip": 0.01073846, "auxiliary_loss_mlp": 0.01022183, "balance_loss_clip": 1.04371357, "balance_loss_mlp": 1.01538253, "epoch": 0.6346419767931221, "flos": 16545052103040.0, "grad_norm": 2.014718374703409, "language_loss": 0.69116527, "learning_rate": 1.2441789912982583e-06, "loss": 0.71212554, "num_input_tokens_seen": 113630085, "step": 5278, "time_per_iteration": 2.83880877494812 }, { "auxiliary_loss_clip": 0.01159478, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 1.05175853, "balance_loss_mlp": 1.01833844, "epoch": 0.6347622196837612, "flos": 24350989973760.0, "grad_norm": 2.408596342511094, "language_loss": 0.64826864, "learning_rate": 1.2434578438756346e-06, "loss": 0.67013049, "num_input_tokens_seen": 113650515, "step": 5279, "time_per_iteration": 3.3460628986358643 }, { "auxiliary_loss_clip": 0.01156665, "auxiliary_loss_mlp": 0.01022568, "balance_loss_clip": 1.04742002, "balance_loss_mlp": 1.01510525, "epoch": 0.6348824625744003, "flos": 64523178195840.0, "grad_norm": 1.926782960058554, "language_loss": 0.78127623, "learning_rate": 1.242736811219855e-06, "loss": 0.80306864, "num_input_tokens_seen": 113676475, "step": 5280, "time_per_iteration": 2.9064671993255615 }, { "auxiliary_loss_clip": 0.01149578, "auxiliary_loss_mlp": 0.010283, "balance_loss_clip": 1.04745221, "balance_loss_mlp": 1.02135015, "epoch": 0.6350027054650393, "flos": 28622133313920.0, "grad_norm": 1.6583067417777395, "language_loss": 0.82038379, "learning_rate": 1.2420158934402988e-06, "loss": 0.84216261, "num_input_tokens_seen": 113697090, "step": 5281, "time_per_iteration": 2.551565647125244 }, { "auxiliary_loss_clip": 0.01113948, "auxiliary_loss_mlp": 0.0102774, "balance_loss_clip": 1.0424459, "balance_loss_mlp": 1.01965201, "epoch": 0.6351229483556785, "flos": 23002544476800.0, "grad_norm": 1.8271357936703216, "language_loss": 0.84529698, "learning_rate": 1.2412950906463286e-06, "loss": 0.86671382, "num_input_tokens_seen": 113714395, "step": 5282, "time_per_iteration": 2.5827860832214355 }, { "auxiliary_loss_clip": 0.01116136, "auxiliary_loss_mlp": 0.01023427, "balance_loss_clip": 1.04631901, "balance_loss_mlp": 1.01660252, "epoch": 0.6352431912463176, "flos": 21939300967680.0, "grad_norm": 2.193792713707502, "language_loss": 0.90084457, "learning_rate": 1.2405744029472902e-06, "loss": 0.9222402, "num_input_tokens_seen": 113733880, "step": 5283, "time_per_iteration": 2.6503188610076904 }, { "auxiliary_loss_clip": 0.01138498, "auxiliary_loss_mlp": 0.01027846, "balance_loss_clip": 1.04782104, "balance_loss_mlp": 1.02052081, "epoch": 0.6353634341369566, "flos": 13735257684480.0, "grad_norm": 1.9441473537636285, "language_loss": 0.76425076, "learning_rate": 1.2398538304525108e-06, "loss": 0.78591418, "num_input_tokens_seen": 113752505, "step": 5284, "time_per_iteration": 2.5487475395202637 }, { "auxiliary_loss_clip": 0.01123924, "auxiliary_loss_mlp": 0.01032807, "balance_loss_clip": 1.04745722, "balance_loss_mlp": 1.02463543, "epoch": 0.6354836770275958, "flos": 19316170552320.0, "grad_norm": 2.2782106040304244, "language_loss": 0.75025249, "learning_rate": 1.2391333732713016e-06, "loss": 0.77181977, "num_input_tokens_seen": 113770310, "step": 5285, "time_per_iteration": 2.596153497695923 }, { "auxiliary_loss_clip": 0.01124427, "auxiliary_loss_mlp": 0.0102494, "balance_loss_clip": 1.04335284, "balance_loss_mlp": 1.01728082, "epoch": 0.6356039199182348, "flos": 21613375935360.0, "grad_norm": 2.1060014544051398, "language_loss": 0.79210305, "learning_rate": 1.2384130315129543e-06, "loss": 0.81359673, "num_input_tokens_seen": 113788635, "step": 5286, "time_per_iteration": 3.386382818222046 }, { "auxiliary_loss_clip": 0.01061647, "auxiliary_loss_mlp": 0.01029721, "balance_loss_clip": 1.03833008, "balance_loss_mlp": 1.02205873, "epoch": 0.6357241628088739, "flos": 18111978074880.0, "grad_norm": 1.991844715565865, "language_loss": 0.733805, "learning_rate": 1.2376928052867447e-06, "loss": 0.75471866, "num_input_tokens_seen": 113807755, "step": 5287, "time_per_iteration": 3.1269989013671875 }, { "auxiliary_loss_clip": 0.01145728, "auxiliary_loss_mlp": 0.01024759, "balance_loss_clip": 1.05085278, "balance_loss_mlp": 1.01790464, "epoch": 0.6358444056995131, "flos": 24935256599040.0, "grad_norm": 1.8368698816193973, "language_loss": 0.77488351, "learning_rate": 1.2369726947019299e-06, "loss": 0.79658836, "num_input_tokens_seen": 113828230, "step": 5288, "time_per_iteration": 2.7839736938476562 }, { "auxiliary_loss_clip": 0.0115172, "auxiliary_loss_mlp": 0.01025416, "balance_loss_clip": 1.04659235, "balance_loss_mlp": 1.01890087, "epoch": 0.6359646485901521, "flos": 23293348986240.0, "grad_norm": 2.1746476698967188, "language_loss": 0.67292738, "learning_rate": 1.2362526998677511e-06, "loss": 0.69469869, "num_input_tokens_seen": 113844595, "step": 5289, "time_per_iteration": 4.854312181472778 }, { "auxiliary_loss_clip": 0.01146013, "auxiliary_loss_mlp": 0.01028402, "balance_loss_clip": 1.04804385, "balance_loss_mlp": 1.02185392, "epoch": 0.6360848914807912, "flos": 20887442069760.0, "grad_norm": 1.6461488326199933, "language_loss": 0.84336793, "learning_rate": 1.2355328208934301e-06, "loss": 0.86511207, "num_input_tokens_seen": 113863470, "step": 5290, "time_per_iteration": 2.6122219562530518 }, { "auxiliary_loss_clip": 0.01155072, "auxiliary_loss_mlp": 0.00760837, "balance_loss_clip": 1.04703951, "balance_loss_mlp": 1.00023282, "epoch": 0.6362051343714303, "flos": 18479775386880.0, "grad_norm": 1.539212755255796, "language_loss": 0.72212434, "learning_rate": 1.2348130578881728e-06, "loss": 0.74128348, "num_input_tokens_seen": 113881690, "step": 5291, "time_per_iteration": 2.4878292083740234 }, { "auxiliary_loss_clip": 0.01172405, "auxiliary_loss_mlp": 0.01030203, "balance_loss_clip": 1.05200052, "balance_loss_mlp": 1.0223645, "epoch": 0.6363253772620694, "flos": 24389594115840.0, "grad_norm": 2.207314816324262, "language_loss": 0.76037192, "learning_rate": 1.2340934109611664e-06, "loss": 0.78239799, "num_input_tokens_seen": 113902450, "step": 5292, "time_per_iteration": 2.5352420806884766 }, { "auxiliary_loss_clip": 0.01144658, "auxiliary_loss_mlp": 0.01029554, "balance_loss_clip": 1.04801011, "balance_loss_mlp": 1.02126884, "epoch": 0.6364456201527084, "flos": 25958243940480.0, "grad_norm": 2.1454029544831412, "language_loss": 0.68477327, "learning_rate": 1.2333738802215798e-06, "loss": 0.70651543, "num_input_tokens_seen": 113922670, "step": 5293, "time_per_iteration": 2.5666635036468506 }, { "auxiliary_loss_clip": 0.01108014, "auxiliary_loss_mlp": 0.01035796, "balance_loss_clip": 1.04162312, "balance_loss_mlp": 1.02862537, "epoch": 0.6365658630433476, "flos": 20740711011840.0, "grad_norm": 1.9158997556390531, "language_loss": 0.81001699, "learning_rate": 1.2326544657785668e-06, "loss": 0.83145511, "num_input_tokens_seen": 113942360, "step": 5294, "time_per_iteration": 2.620126485824585 }, { "auxiliary_loss_clip": 0.01116684, "auxiliary_loss_mlp": 0.01028442, "balance_loss_clip": 1.04347181, "balance_loss_mlp": 1.02122366, "epoch": 0.6366861059339867, "flos": 21434146047360.0, "grad_norm": 2.194107931801718, "language_loss": 0.74390751, "learning_rate": 1.2319351677412608e-06, "loss": 0.76535869, "num_input_tokens_seen": 113959405, "step": 5295, "time_per_iteration": 2.549358367919922 }, { "auxiliary_loss_clip": 0.01134545, "auxiliary_loss_mlp": 0.01029238, "balance_loss_clip": 1.04663062, "balance_loss_mlp": 1.0219574, "epoch": 0.6368063488246257, "flos": 22267093507200.0, "grad_norm": 1.703136582340067, "language_loss": 0.73985112, "learning_rate": 1.2312159862187796e-06, "loss": 0.76148903, "num_input_tokens_seen": 113977815, "step": 5296, "time_per_iteration": 2.5587401390075684 }, { "auxiliary_loss_clip": 0.01172419, "auxiliary_loss_mlp": 0.01025827, "balance_loss_clip": 1.05201221, "balance_loss_mlp": 1.01770329, "epoch": 0.6369265917152649, "flos": 22420719976320.0, "grad_norm": 1.499902184555326, "language_loss": 0.75962514, "learning_rate": 1.2304969213202217e-06, "loss": 0.78160763, "num_input_tokens_seen": 113999075, "step": 5297, "time_per_iteration": 2.486853837966919 }, { "auxiliary_loss_clip": 0.01138533, "auxiliary_loss_mlp": 0.01033229, "balance_loss_clip": 1.04767323, "balance_loss_mlp": 1.02580833, "epoch": 0.6370468346059039, "flos": 24718176754560.0, "grad_norm": 2.2964027136958034, "language_loss": 0.79360092, "learning_rate": 1.2297779731546692e-06, "loss": 0.81531847, "num_input_tokens_seen": 114018170, "step": 5298, "time_per_iteration": 2.5833945274353027 }, { "auxiliary_loss_clip": 0.01143609, "auxiliary_loss_mlp": 0.01023112, "balance_loss_clip": 1.05226398, "balance_loss_mlp": 1.01621306, "epoch": 0.637167077496543, "flos": 25296589463040.0, "grad_norm": 1.8573831777631826, "language_loss": 0.78000706, "learning_rate": 1.2290591418311853e-06, "loss": 0.80167425, "num_input_tokens_seen": 114035565, "step": 5299, "time_per_iteration": 2.536646842956543 }, { "auxiliary_loss_clip": 0.0115389, "auxiliary_loss_mlp": 0.01033407, "balance_loss_clip": 1.04873061, "balance_loss_mlp": 1.02631664, "epoch": 0.637287320387182, "flos": 27671110871040.0, "grad_norm": 1.4926993010194123, "language_loss": 0.72275853, "learning_rate": 1.2283404274588172e-06, "loss": 0.74463153, "num_input_tokens_seen": 114054510, "step": 5300, "time_per_iteration": 2.5669572353363037 }, { "auxiliary_loss_clip": 0.00993237, "auxiliary_loss_mlp": 0.01001944, "balance_loss_clip": 1.01319146, "balance_loss_mlp": 1.00088906, "epoch": 0.6374075632778212, "flos": 63173406873600.0, "grad_norm": 0.7433207957837333, "language_loss": 0.527785, "learning_rate": 1.227621830146592e-06, "loss": 0.54773682, "num_input_tokens_seen": 114109875, "step": 5301, "time_per_iteration": 3.236919641494751 }, { "auxiliary_loss_clip": 0.01129018, "auxiliary_loss_mlp": 0.0102771, "balance_loss_clip": 1.04689705, "balance_loss_mlp": 1.02025306, "epoch": 0.6375278061684603, "flos": 25558127366400.0, "grad_norm": 1.8092996947499271, "language_loss": 0.79031444, "learning_rate": 1.2269033500035217e-06, "loss": 0.81188172, "num_input_tokens_seen": 114130010, "step": 5302, "time_per_iteration": 2.996455192565918 }, { "auxiliary_loss_clip": 0.0112503, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 1.04778111, "balance_loss_mlp": 1.01959991, "epoch": 0.6376480490590993, "flos": 25666362023040.0, "grad_norm": 1.8663737679600674, "language_loss": 0.74120653, "learning_rate": 1.2261849871385988e-06, "loss": 0.7627241, "num_input_tokens_seen": 114151115, "step": 5303, "time_per_iteration": 2.5912888050079346 }, { "auxiliary_loss_clip": 0.01172063, "auxiliary_loss_mlp": 0.01027325, "balance_loss_clip": 1.05189908, "balance_loss_mlp": 1.01987422, "epoch": 0.6377682919497385, "flos": 31537684350720.0, "grad_norm": 2.185380739485516, "language_loss": 0.62498164, "learning_rate": 1.2254667416607972e-06, "loss": 0.64697552, "num_input_tokens_seen": 114172715, "step": 5304, "time_per_iteration": 3.8812804222106934 }, { "auxiliary_loss_clip": 0.01156201, "auxiliary_loss_mlp": 0.01028128, "balance_loss_clip": 1.05010748, "balance_loss_mlp": 1.02041495, "epoch": 0.6378885348403776, "flos": 23039209284480.0, "grad_norm": 1.8849406424888104, "language_loss": 0.82934165, "learning_rate": 1.2247486136790756e-06, "loss": 0.85118496, "num_input_tokens_seen": 114192195, "step": 5305, "time_per_iteration": 2.552899122238159 }, { "auxiliary_loss_clip": 0.01157715, "auxiliary_loss_mlp": 0.01031763, "balance_loss_clip": 1.05034399, "balance_loss_mlp": 1.0243305, "epoch": 0.6380087777310166, "flos": 18697070712960.0, "grad_norm": 2.292850998365218, "language_loss": 0.80222344, "learning_rate": 1.2240306033023726e-06, "loss": 0.82411826, "num_input_tokens_seen": 114210020, "step": 5306, "time_per_iteration": 2.489532232284546 }, { "auxiliary_loss_clip": 0.01128256, "auxiliary_loss_mlp": 0.0102778, "balance_loss_clip": 1.04167151, "balance_loss_mlp": 1.02080059, "epoch": 0.6381290206216558, "flos": 23331558078720.0, "grad_norm": 2.343393505043547, "language_loss": 0.71983123, "learning_rate": 1.223312710639611e-06, "loss": 0.7413916, "num_input_tokens_seen": 114228740, "step": 5307, "time_per_iteration": 2.566988468170166 }, { "auxiliary_loss_clip": 0.01140645, "auxiliary_loss_mlp": 0.01025856, "balance_loss_clip": 1.04869556, "balance_loss_mlp": 1.01803839, "epoch": 0.6382492635122948, "flos": 18880466578560.0, "grad_norm": 2.060896434759809, "language_loss": 0.87113333, "learning_rate": 1.2225949357996928e-06, "loss": 0.8927983, "num_input_tokens_seen": 114246865, "step": 5308, "time_per_iteration": 2.531541585922241 }, { "auxiliary_loss_clip": 0.01152582, "auxiliary_loss_mlp": 0.01032973, "balance_loss_clip": 1.0505985, "balance_loss_mlp": 1.0258379, "epoch": 0.6383695064029339, "flos": 27819134818560.0, "grad_norm": 1.744613598380152, "language_loss": 0.80231178, "learning_rate": 1.221877278891505e-06, "loss": 0.82416725, "num_input_tokens_seen": 114266120, "step": 5309, "time_per_iteration": 2.5497078895568848 }, { "auxiliary_loss_clip": 0.01158983, "auxiliary_loss_mlp": 0.01030384, "balance_loss_clip": 1.05230653, "balance_loss_mlp": 1.0223434, "epoch": 0.638489749293573, "flos": 26395635853440.0, "grad_norm": 1.9288654505998788, "language_loss": 0.71635884, "learning_rate": 1.221159740023915e-06, "loss": 0.73825252, "num_input_tokens_seen": 114285950, "step": 5310, "time_per_iteration": 2.5483405590057373 }, { "auxiliary_loss_clip": 0.01136718, "auxiliary_loss_mlp": 0.00761116, "balance_loss_clip": 1.04830408, "balance_loss_mlp": 1.00022054, "epoch": 0.6386099921842121, "flos": 23988328306560.0, "grad_norm": 1.881137350257827, "language_loss": 0.72880101, "learning_rate": 1.2204423193057735e-06, "loss": 0.74777937, "num_input_tokens_seen": 114304780, "step": 5311, "time_per_iteration": 2.607191324234009 }, { "auxiliary_loss_clip": 0.01041303, "auxiliary_loss_mlp": 0.01001608, "balance_loss_clip": 1.01267374, "balance_loss_mlp": 1.00055909, "epoch": 0.6387302350748512, "flos": 71731169337600.0, "grad_norm": 0.8584354543625721, "language_loss": 0.63398916, "learning_rate": 1.2197250168459122e-06, "loss": 0.65441823, "num_input_tokens_seen": 114361180, "step": 5312, "time_per_iteration": 3.903045892715454 }, { "auxiliary_loss_clip": 0.01158273, "auxiliary_loss_mlp": 0.01029101, "balance_loss_clip": 1.04963994, "balance_loss_mlp": 1.02182364, "epoch": 0.6388504779654903, "flos": 14535778141440.0, "grad_norm": 1.7338731332971495, "language_loss": 0.7436738, "learning_rate": 1.2190078327531454e-06, "loss": 0.76554757, "num_input_tokens_seen": 114377425, "step": 5313, "time_per_iteration": 2.478257894515991 }, { "auxiliary_loss_clip": 0.01154777, "auxiliary_loss_mlp": 0.01030104, "balance_loss_clip": 1.04784405, "balance_loss_mlp": 1.02317774, "epoch": 0.6389707208561294, "flos": 22346133384960.0, "grad_norm": 1.493586880893391, "language_loss": 0.7272464, "learning_rate": 1.2182907671362697e-06, "loss": 0.7490952, "num_input_tokens_seen": 114398120, "step": 5314, "time_per_iteration": 2.530974864959717 }, { "auxiliary_loss_clip": 0.01157423, "auxiliary_loss_mlp": 0.01029146, "balance_loss_clip": 1.05184221, "balance_loss_mlp": 1.02188015, "epoch": 0.6390909637467684, "flos": 19426883247360.0, "grad_norm": 1.8606054502117242, "language_loss": 0.78452051, "learning_rate": 1.2175738201040626e-06, "loss": 0.80638623, "num_input_tokens_seen": 114415160, "step": 5315, "time_per_iteration": 4.0692524909973145 }, { "auxiliary_loss_clip": 0.01156175, "auxiliary_loss_mlp": 0.01027996, "balance_loss_clip": 1.05044675, "balance_loss_mlp": 1.02051544, "epoch": 0.6392112066374076, "flos": 24090852700800.0, "grad_norm": 1.6015531886200642, "language_loss": 0.78554583, "learning_rate": 1.2168569917652855e-06, "loss": 0.80738747, "num_input_tokens_seen": 114435015, "step": 5316, "time_per_iteration": 2.5467865467071533 }, { "auxiliary_loss_clip": 0.01154443, "auxiliary_loss_mlp": 0.01025294, "balance_loss_clip": 1.04961967, "balance_loss_mlp": 1.01815343, "epoch": 0.6393314495280467, "flos": 26795141896320.0, "grad_norm": 1.6070525992818563, "language_loss": 0.63701081, "learning_rate": 1.2161402822286797e-06, "loss": 0.65880811, "num_input_tokens_seen": 114455700, "step": 5317, "time_per_iteration": 2.5565969944000244 }, { "auxiliary_loss_clip": 0.01126601, "auxiliary_loss_mlp": 0.01027483, "balance_loss_clip": 1.04617918, "balance_loss_mlp": 1.01987159, "epoch": 0.6394516924186857, "flos": 20260692633600.0, "grad_norm": 1.9517276936326482, "language_loss": 0.7865243, "learning_rate": 1.2154236916029703e-06, "loss": 0.80806518, "num_input_tokens_seen": 114473675, "step": 5318, "time_per_iteration": 2.5691888332366943 }, { "auxiliary_loss_clip": 0.01111736, "auxiliary_loss_mlp": 0.01029389, "balance_loss_clip": 1.0401715, "balance_loss_mlp": 1.02212667, "epoch": 0.6395719353093249, "flos": 18368847210240.0, "grad_norm": 2.096708933658815, "language_loss": 0.73253292, "learning_rate": 1.2147072199968627e-06, "loss": 0.75394416, "num_input_tokens_seen": 114492310, "step": 5319, "time_per_iteration": 2.578903913497925 }, { "auxiliary_loss_clip": 0.01154714, "auxiliary_loss_mlp": 0.01024417, "balance_loss_clip": 1.04996204, "balance_loss_mlp": 1.01769412, "epoch": 0.6396921781999639, "flos": 17566315591680.0, "grad_norm": 1.8064551682845527, "language_loss": 0.7100569, "learning_rate": 1.2139908675190454e-06, "loss": 0.73184824, "num_input_tokens_seen": 114511520, "step": 5320, "time_per_iteration": 2.5008301734924316 }, { "auxiliary_loss_clip": 0.01090968, "auxiliary_loss_mlp": 0.01028118, "balance_loss_clip": 1.03939795, "balance_loss_mlp": 1.02067947, "epoch": 0.639812421090603, "flos": 21251252972160.0, "grad_norm": 1.8653591746839606, "language_loss": 0.74788666, "learning_rate": 1.2132746342781883e-06, "loss": 0.76907748, "num_input_tokens_seen": 114532680, "step": 5321, "time_per_iteration": 2.666193962097168 }, { "auxiliary_loss_clip": 0.01173198, "auxiliary_loss_mlp": 0.0103672, "balance_loss_clip": 1.05298245, "balance_loss_mlp": 1.02909017, "epoch": 0.6399326639812422, "flos": 11180967684480.0, "grad_norm": 2.2567669373505512, "language_loss": 0.79549533, "learning_rate": 1.2125585203829442e-06, "loss": 0.81759453, "num_input_tokens_seen": 114548320, "step": 5322, "time_per_iteration": 2.4794082641601562 }, { "auxiliary_loss_clip": 0.01115329, "auxiliary_loss_mlp": 0.01025434, "balance_loss_clip": 1.04640651, "balance_loss_mlp": 1.01800728, "epoch": 0.6400529068718812, "flos": 23911048195200.0, "grad_norm": 1.9052960527467953, "language_loss": 0.73815221, "learning_rate": 1.211842525941946e-06, "loss": 0.75955981, "num_input_tokens_seen": 114568115, "step": 5323, "time_per_iteration": 2.5815987586975098 }, { "auxiliary_loss_clip": 0.01109879, "auxiliary_loss_mlp": 0.01027824, "balance_loss_clip": 1.04437089, "balance_loss_mlp": 1.02047217, "epoch": 0.6401731497625203, "flos": 44018724890880.0, "grad_norm": 1.7496701544823012, "language_loss": 0.78863549, "learning_rate": 1.2111266510638105e-06, "loss": 0.81001252, "num_input_tokens_seen": 114591040, "step": 5324, "time_per_iteration": 2.809509515762329 }, { "auxiliary_loss_clip": 0.01090346, "auxiliary_loss_mlp": 0.01025716, "balance_loss_clip": 1.04070365, "balance_loss_mlp": 1.01793206, "epoch": 0.6402933926531594, "flos": 20662209838080.0, "grad_norm": 1.6622761084676105, "language_loss": 0.8002643, "learning_rate": 1.2104108958571346e-06, "loss": 0.82142496, "num_input_tokens_seen": 114609310, "step": 5325, "time_per_iteration": 2.6218109130859375 }, { "auxiliary_loss_clip": 0.01155688, "auxiliary_loss_mlp": 0.01025703, "balance_loss_clip": 1.05072904, "balance_loss_mlp": 1.01886344, "epoch": 0.6404136355437985, "flos": 24863327614080.0, "grad_norm": 1.4960894967485026, "language_loss": 0.75767696, "learning_rate": 1.2096952604304975e-06, "loss": 0.77949083, "num_input_tokens_seen": 114629740, "step": 5326, "time_per_iteration": 2.5579049587249756 }, { "auxiliary_loss_clip": 0.01155046, "auxiliary_loss_mlp": 0.0102907, "balance_loss_clip": 1.0473597, "balance_loss_mlp": 1.02133942, "epoch": 0.6405338784344375, "flos": 40479548901120.0, "grad_norm": 2.2826015944220206, "language_loss": 0.70150238, "learning_rate": 1.2089797448924616e-06, "loss": 0.72334361, "num_input_tokens_seen": 114653615, "step": 5327, "time_per_iteration": 2.654397487640381 }, { "auxiliary_loss_clip": 0.01116342, "auxiliary_loss_mlp": 0.01026772, "balance_loss_clip": 1.0433836, "balance_loss_mlp": 1.01914811, "epoch": 0.6406541213250767, "flos": 20886041439360.0, "grad_norm": 2.646278135927872, "language_loss": 0.65968931, "learning_rate": 1.2082643493515692e-06, "loss": 0.68112046, "num_input_tokens_seen": 114671935, "step": 5328, "time_per_iteration": 2.603133201599121 }, { "auxiliary_loss_clip": 0.01157849, "auxiliary_loss_mlp": 0.01027113, "balance_loss_clip": 1.05230606, "balance_loss_mlp": 1.01984131, "epoch": 0.6407743642157158, "flos": 23295970679040.0, "grad_norm": 2.6633337354509594, "language_loss": 0.81659806, "learning_rate": 1.207549073916346e-06, "loss": 0.83844763, "num_input_tokens_seen": 114692870, "step": 5329, "time_per_iteration": 2.5119776725769043 }, { "auxiliary_loss_clip": 0.01134814, "auxiliary_loss_mlp": 0.01028181, "balance_loss_clip": 1.04895616, "balance_loss_mlp": 1.02123451, "epoch": 0.6408946071063548, "flos": 15012636122880.0, "grad_norm": 2.026905653638198, "language_loss": 0.77309245, "learning_rate": 1.2068339186952976e-06, "loss": 0.79472244, "num_input_tokens_seen": 114710410, "step": 5330, "time_per_iteration": 3.326878547668457 }, { "auxiliary_loss_clip": 0.01160348, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 1.05142212, "balance_loss_mlp": 1.02049971, "epoch": 0.6410148499969939, "flos": 22528595496960.0, "grad_norm": 1.7485904327909214, "language_loss": 0.7268889, "learning_rate": 1.2061188837969136e-06, "loss": 0.74877715, "num_input_tokens_seen": 114730020, "step": 5331, "time_per_iteration": 2.4940669536590576 }, { "auxiliary_loss_clip": 0.01121398, "auxiliary_loss_mlp": 0.01030013, "balance_loss_clip": 1.04219723, "balance_loss_mlp": 1.02206802, "epoch": 0.641135092887633, "flos": 12422004537600.0, "grad_norm": 3.0693600279795263, "language_loss": 0.84271312, "learning_rate": 1.2054039693296631e-06, "loss": 0.86422724, "num_input_tokens_seen": 114748015, "step": 5332, "time_per_iteration": 2.5751919746398926 }, { "auxiliary_loss_clip": 0.01120564, "auxiliary_loss_mlp": 0.01029037, "balance_loss_clip": 1.04354858, "balance_loss_mlp": 1.02185714, "epoch": 0.6412553357782721, "flos": 22127329687680.0, "grad_norm": 1.694619781669432, "language_loss": 0.8184185, "learning_rate": 1.2046891754019992e-06, "loss": 0.83991456, "num_input_tokens_seen": 114768625, "step": 5333, "time_per_iteration": 2.5821332931518555 }, { "auxiliary_loss_clip": 0.01160501, "auxiliary_loss_mlp": 0.01027716, "balance_loss_clip": 1.05245495, "balance_loss_mlp": 1.02010703, "epoch": 0.6413755786689112, "flos": 15888605097600.0, "grad_norm": 2.0720602602752347, "language_loss": 0.82326102, "learning_rate": 1.2039745021223548e-06, "loss": 0.8451432, "num_input_tokens_seen": 114786045, "step": 5334, "time_per_iteration": 2.4994943141937256 }, { "auxiliary_loss_clip": 0.01019736, "auxiliary_loss_mlp": 0.01002474, "balance_loss_clip": 1.01732373, "balance_loss_mlp": 1.00120413, "epoch": 0.6414958215595503, "flos": 68039159955840.0, "grad_norm": 0.7952841622587897, "language_loss": 0.5706771, "learning_rate": 1.2032599495991456e-06, "loss": 0.59089917, "num_input_tokens_seen": 114850785, "step": 5335, "time_per_iteration": 3.274405002593994 }, { "auxiliary_loss_clip": 0.01160656, "auxiliary_loss_mlp": 0.01027123, "balance_loss_clip": 1.05311525, "balance_loss_mlp": 1.0192138, "epoch": 0.6416160644501894, "flos": 44091300320640.0, "grad_norm": 1.7944699092836465, "language_loss": 0.69842386, "learning_rate": 1.2025455179407685e-06, "loss": 0.72030163, "num_input_tokens_seen": 114871945, "step": 5336, "time_per_iteration": 2.732163667678833 }, { "auxiliary_loss_clip": 0.01152765, "auxiliary_loss_mlp": 0.00761228, "balance_loss_clip": 1.04901671, "balance_loss_mlp": 1.0002048, "epoch": 0.6417363073408284, "flos": 20959837931520.0, "grad_norm": 1.776341706315445, "language_loss": 0.73793161, "learning_rate": 1.2018312072556022e-06, "loss": 0.75707155, "num_input_tokens_seen": 114890445, "step": 5337, "time_per_iteration": 2.5288796424865723 }, { "auxiliary_loss_clip": 0.01169635, "auxiliary_loss_mlp": 0.00761228, "balance_loss_clip": 1.05173528, "balance_loss_mlp": 1.00021267, "epoch": 0.6418565502314676, "flos": 22455122227200.0, "grad_norm": 2.3045715371385223, "language_loss": 0.74091148, "learning_rate": 1.2011170176520077e-06, "loss": 0.76022011, "num_input_tokens_seen": 114911360, "step": 5338, "time_per_iteration": 3.2635276317596436 }, { "auxiliary_loss_clip": 0.01086764, "auxiliary_loss_mlp": 0.01033096, "balance_loss_clip": 1.04205465, "balance_loss_mlp": 1.02581263, "epoch": 0.6419767931221066, "flos": 25045502417280.0, "grad_norm": 1.4722780255137728, "language_loss": 0.81080997, "learning_rate": 1.2004029492383256e-06, "loss": 0.83200854, "num_input_tokens_seen": 114932700, "step": 5339, "time_per_iteration": 2.6622889041900635 }, { "auxiliary_loss_clip": 0.01156906, "auxiliary_loss_mlp": 0.01026789, "balance_loss_clip": 1.05168462, "balance_loss_mlp": 1.01901627, "epoch": 0.6420970360127457, "flos": 19463691709440.0, "grad_norm": 1.845981060939152, "language_loss": 0.7378515, "learning_rate": 1.1996890021228814e-06, "loss": 0.75968844, "num_input_tokens_seen": 114949475, "step": 5340, "time_per_iteration": 3.3060555458068848 }, { "auxiliary_loss_clip": 0.01137858, "auxiliary_loss_mlp": 0.01025032, "balance_loss_clip": 1.04565334, "balance_loss_mlp": 1.01773071, "epoch": 0.6422172789033849, "flos": 40406147458560.0, "grad_norm": 1.4828407652835038, "language_loss": 0.69936657, "learning_rate": 1.1989751764139785e-06, "loss": 0.72099543, "num_input_tokens_seen": 114973125, "step": 5341, "time_per_iteration": 2.7215371131896973 }, { "auxiliary_loss_clip": 0.0111086, "auxiliary_loss_mlp": 0.01020855, "balance_loss_clip": 1.04008222, "balance_loss_mlp": 1.01370251, "epoch": 0.6423375217940239, "flos": 27672870637440.0, "grad_norm": 1.7145142539739229, "language_loss": 0.8339873, "learning_rate": 1.1982614722199044e-06, "loss": 0.85530442, "num_input_tokens_seen": 114994300, "step": 5342, "time_per_iteration": 3.3978445529937744 }, { "auxiliary_loss_clip": 0.01145292, "auxiliary_loss_mlp": 0.01024699, "balance_loss_clip": 1.04703283, "balance_loss_mlp": 1.01783252, "epoch": 0.642457764684663, "flos": 18369242259840.0, "grad_norm": 2.0713618399692355, "language_loss": 0.77812189, "learning_rate": 1.1975478896489276e-06, "loss": 0.79982173, "num_input_tokens_seen": 115012135, "step": 5343, "time_per_iteration": 2.5136735439300537 }, { "auxiliary_loss_clip": 0.01169001, "auxiliary_loss_mlp": 0.01024328, "balance_loss_clip": 1.05041587, "balance_loss_mlp": 1.0172919, "epoch": 0.6425780075753021, "flos": 19750509809280.0, "grad_norm": 1.8741224024659697, "language_loss": 0.76482832, "learning_rate": 1.1968344288092981e-06, "loss": 0.78676164, "num_input_tokens_seen": 115028715, "step": 5344, "time_per_iteration": 2.476367473602295 }, { "auxiliary_loss_clip": 0.01157202, "auxiliary_loss_mlp": 0.00761397, "balance_loss_clip": 1.05154371, "balance_loss_mlp": 1.00023222, "epoch": 0.6426982504659412, "flos": 20558536208640.0, "grad_norm": 1.7183875266634723, "language_loss": 0.64680231, "learning_rate": 1.1961210898092468e-06, "loss": 0.66598833, "num_input_tokens_seen": 115047665, "step": 5345, "time_per_iteration": 2.5369062423706055 }, { "auxiliary_loss_clip": 0.01152069, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.05145597, "balance_loss_mlp": 1.02212226, "epoch": 0.6428184933565803, "flos": 17851984456320.0, "grad_norm": 2.335823856246759, "language_loss": 0.79095185, "learning_rate": 1.1954078727569874e-06, "loss": 0.81276834, "num_input_tokens_seen": 115064965, "step": 5346, "time_per_iteration": 2.5327959060668945 }, { "auxiliary_loss_clip": 0.01133006, "auxiliary_loss_mlp": 0.00760958, "balance_loss_clip": 1.04581511, "balance_loss_mlp": 1.00024271, "epoch": 0.6429387362472194, "flos": 22456953820800.0, "grad_norm": 1.7657001314341272, "language_loss": 0.77960992, "learning_rate": 1.1946947777607141e-06, "loss": 0.79854953, "num_input_tokens_seen": 115086100, "step": 5347, "time_per_iteration": 2.5912585258483887 }, { "auxiliary_loss_clip": 0.0110736, "auxiliary_loss_mlp": 0.0102867, "balance_loss_clip": 1.04359448, "balance_loss_mlp": 1.02148438, "epoch": 0.6430589791378585, "flos": 24752579005440.0, "grad_norm": 1.8490055534702332, "language_loss": 0.79921544, "learning_rate": 1.1939818049286024e-06, "loss": 0.82057571, "num_input_tokens_seen": 115104260, "step": 5348, "time_per_iteration": 2.6298482418060303 }, { "auxiliary_loss_clip": 0.0109005, "auxiliary_loss_mlp": 0.01032197, "balance_loss_clip": 1.04339373, "balance_loss_mlp": 1.02480912, "epoch": 0.6431792220284975, "flos": 24901249397760.0, "grad_norm": 1.515994914793429, "language_loss": 0.75695705, "learning_rate": 1.1932689543688101e-06, "loss": 0.77817953, "num_input_tokens_seen": 115125365, "step": 5349, "time_per_iteration": 2.649275779724121 }, { "auxiliary_loss_clip": 0.01141227, "auxiliary_loss_mlp": 0.01026526, "balance_loss_clip": 1.04952312, "balance_loss_mlp": 1.01851511, "epoch": 0.6432994649191367, "flos": 21032305620480.0, "grad_norm": 1.7424819569520504, "language_loss": 0.72868478, "learning_rate": 1.1925562261894756e-06, "loss": 0.75036228, "num_input_tokens_seen": 115144445, "step": 5350, "time_per_iteration": 2.5542449951171875 }, { "auxiliary_loss_clip": 0.01137557, "auxiliary_loss_mlp": 0.01031016, "balance_loss_clip": 1.04685187, "balance_loss_mlp": 1.02422118, "epoch": 0.6434197078097758, "flos": 30884433655680.0, "grad_norm": 1.8190494357330105, "language_loss": 0.77534831, "learning_rate": 1.1918436204987207e-06, "loss": 0.79703408, "num_input_tokens_seen": 115166305, "step": 5351, "time_per_iteration": 2.6096158027648926 }, { "auxiliary_loss_clip": 0.01150767, "auxiliary_loss_mlp": 0.01025251, "balance_loss_clip": 1.05028403, "balance_loss_mlp": 1.01829529, "epoch": 0.6435399507004148, "flos": 15012492468480.0, "grad_norm": 2.2447958918494764, "language_loss": 0.81698859, "learning_rate": 1.191131137404645e-06, "loss": 0.83874869, "num_input_tokens_seen": 115183045, "step": 5352, "time_per_iteration": 2.489264965057373 }, { "auxiliary_loss_clip": 0.01117981, "auxiliary_loss_mlp": 0.01033293, "balance_loss_clip": 1.04570544, "balance_loss_mlp": 1.02562809, "epoch": 0.643660193591054, "flos": 19901981462400.0, "grad_norm": 2.017726852567579, "language_loss": 0.76927447, "learning_rate": 1.190418777015333e-06, "loss": 0.79078728, "num_input_tokens_seen": 115201955, "step": 5353, "time_per_iteration": 2.5625014305114746 }, { "auxiliary_loss_clip": 0.0114126, "auxiliary_loss_mlp": 0.01029248, "balance_loss_clip": 1.04857409, "balance_loss_mlp": 1.02254891, "epoch": 0.643780436481693, "flos": 24133622820480.0, "grad_norm": 1.3675177501409728, "language_loss": 0.73383647, "learning_rate": 1.1897065394388487e-06, "loss": 0.75554156, "num_input_tokens_seen": 115222395, "step": 5354, "time_per_iteration": 2.565554141998291 }, { "auxiliary_loss_clip": 0.01149425, "auxiliary_loss_mlp": 0.01041789, "balance_loss_clip": 1.0548408, "balance_loss_mlp": 1.03396893, "epoch": 0.6439006793723321, "flos": 23148808657920.0, "grad_norm": 1.5874281578699188, "language_loss": 0.76489425, "learning_rate": 1.1889944247832385e-06, "loss": 0.78680634, "num_input_tokens_seen": 115242635, "step": 5355, "time_per_iteration": 2.564833164215088 }, { "auxiliary_loss_clip": 0.01157221, "auxiliary_loss_mlp": 0.01025205, "balance_loss_clip": 1.04738903, "balance_loss_mlp": 1.01777887, "epoch": 0.6440209222629713, "flos": 23617909301760.0, "grad_norm": 1.7941243661368356, "language_loss": 0.70818055, "learning_rate": 1.1882824331565283e-06, "loss": 0.73000479, "num_input_tokens_seen": 115262095, "step": 5356, "time_per_iteration": 3.39786696434021 }, { "auxiliary_loss_clip": 0.01121581, "auxiliary_loss_mlp": 0.01028567, "balance_loss_clip": 1.04295182, "balance_loss_mlp": 1.02131927, "epoch": 0.6441411651536103, "flos": 16544872535040.0, "grad_norm": 2.0687142856011964, "language_loss": 0.89636707, "learning_rate": 1.1875705646667287e-06, "loss": 0.91786861, "num_input_tokens_seen": 115279985, "step": 5357, "time_per_iteration": 2.563932418823242 }, { "auxiliary_loss_clip": 0.01151178, "auxiliary_loss_mlp": 0.01027769, "balance_loss_clip": 1.04577005, "balance_loss_mlp": 1.02047598, "epoch": 0.6442614080442494, "flos": 25410965345280.0, "grad_norm": 2.047804123698494, "language_loss": 0.75786555, "learning_rate": 1.1868588194218282e-06, "loss": 0.77965498, "num_input_tokens_seen": 115300365, "step": 5358, "time_per_iteration": 2.5637528896331787 }, { "auxiliary_loss_clip": 0.01149667, "auxiliary_loss_mlp": 0.01032074, "balance_loss_clip": 1.04955506, "balance_loss_mlp": 1.0250349, "epoch": 0.6443816509348885, "flos": 28294017552000.0, "grad_norm": 1.5878208440473687, "language_loss": 0.74121487, "learning_rate": 1.1861471975297979e-06, "loss": 0.76303232, "num_input_tokens_seen": 115322060, "step": 5359, "time_per_iteration": 2.6340930461883545 }, { "auxiliary_loss_clip": 0.01125735, "auxiliary_loss_mlp": 0.01027455, "balance_loss_clip": 1.04814744, "balance_loss_mlp": 1.01983142, "epoch": 0.6445018938255276, "flos": 36690075964800.0, "grad_norm": 1.565260689621832, "language_loss": 0.70743787, "learning_rate": 1.185435699098591e-06, "loss": 0.72896975, "num_input_tokens_seen": 115348255, "step": 5360, "time_per_iteration": 2.7744717597961426 }, { "auxiliary_loss_clip": 0.01144947, "auxiliary_loss_mlp": 0.01033021, "balance_loss_clip": 1.04786217, "balance_loss_mlp": 1.02510548, "epoch": 0.6446221367161666, "flos": 14501411804160.0, "grad_norm": 2.266588614968, "language_loss": 0.78247392, "learning_rate": 1.1847243242361403e-06, "loss": 0.80425358, "num_input_tokens_seen": 115366845, "step": 5361, "time_per_iteration": 2.5077455043792725 }, { "auxiliary_loss_clip": 0.01140416, "auxiliary_loss_mlp": 0.01024999, "balance_loss_clip": 1.04631257, "balance_loss_mlp": 1.0175159, "epoch": 0.6447423796068057, "flos": 24609367480320.0, "grad_norm": 1.8848428232194765, "language_loss": 0.77935326, "learning_rate": 1.1840130730503624e-06, "loss": 0.80100745, "num_input_tokens_seen": 115388125, "step": 5362, "time_per_iteration": 2.636784791946411 }, { "auxiliary_loss_clip": 0.01172448, "auxiliary_loss_mlp": 0.01025049, "balance_loss_clip": 1.05288577, "balance_loss_mlp": 1.01820993, "epoch": 0.6448626224974449, "flos": 25047298097280.0, "grad_norm": 2.1675308976534486, "language_loss": 0.75112426, "learning_rate": 1.1833019456491518e-06, "loss": 0.7730993, "num_input_tokens_seen": 115409655, "step": 5363, "time_per_iteration": 2.4969983100891113 }, { "auxiliary_loss_clip": 0.01155973, "auxiliary_loss_mlp": 0.01028224, "balance_loss_clip": 1.0505898, "balance_loss_mlp": 1.02075005, "epoch": 0.6449828653880839, "flos": 22530355263360.0, "grad_norm": 2.6162378884423942, "language_loss": 0.78477418, "learning_rate": 1.1825909421403871e-06, "loss": 0.80661619, "num_input_tokens_seen": 115428750, "step": 5364, "time_per_iteration": 3.3288347721099854 }, { "auxiliary_loss_clip": 0.01156662, "auxiliary_loss_mlp": 0.01032105, "balance_loss_clip": 1.04946232, "balance_loss_mlp": 1.02486312, "epoch": 0.645103108278723, "flos": 25695736369920.0, "grad_norm": 1.6964544267802024, "language_loss": 0.76390147, "learning_rate": 1.181880062631926e-06, "loss": 0.78578913, "num_input_tokens_seen": 115448085, "step": 5365, "time_per_iteration": 2.5154478549957275 }, { "auxiliary_loss_clip": 0.01134966, "auxiliary_loss_mlp": 0.01029327, "balance_loss_clip": 1.04704237, "balance_loss_mlp": 1.02197433, "epoch": 0.6452233511693621, "flos": 27450331925760.0, "grad_norm": 1.9628322367629005, "language_loss": 0.84476554, "learning_rate": 1.1811693072316093e-06, "loss": 0.86640847, "num_input_tokens_seen": 115465765, "step": 5366, "time_per_iteration": 3.3790407180786133 }, { "auxiliary_loss_clip": 0.01169515, "auxiliary_loss_mlp": 0.00761505, "balance_loss_clip": 1.05032754, "balance_loss_mlp": 1.00021553, "epoch": 0.6453435940600012, "flos": 19208618254080.0, "grad_norm": 2.4848752931977396, "language_loss": 0.84082818, "learning_rate": 1.1804586760472574e-06, "loss": 0.86013836, "num_input_tokens_seen": 115482230, "step": 5367, "time_per_iteration": 3.2257301807403564 }, { "auxiliary_loss_clip": 0.01124829, "auxiliary_loss_mlp": 0.01024236, "balance_loss_clip": 1.04410255, "balance_loss_mlp": 1.01684546, "epoch": 0.6454638369506402, "flos": 25737680476800.0, "grad_norm": 2.0650440432238377, "language_loss": 0.80448234, "learning_rate": 1.1797481691866736e-06, "loss": 0.82597297, "num_input_tokens_seen": 115499455, "step": 5368, "time_per_iteration": 2.5998034477233887 }, { "auxiliary_loss_clip": 0.01130752, "auxiliary_loss_mlp": 0.01029606, "balance_loss_clip": 1.04577196, "balance_loss_mlp": 1.02198291, "epoch": 0.6455840798412794, "flos": 20989176364800.0, "grad_norm": 1.9576451787498792, "language_loss": 0.83128327, "learning_rate": 1.1790377867576393e-06, "loss": 0.8528868, "num_input_tokens_seen": 115517205, "step": 5369, "time_per_iteration": 2.5251669883728027 }, { "auxiliary_loss_clip": 0.01144125, "auxiliary_loss_mlp": 0.01034692, "balance_loss_clip": 1.04604614, "balance_loss_mlp": 1.02761149, "epoch": 0.6457043227319185, "flos": 26067556005120.0, "grad_norm": 1.8728559089025174, "language_loss": 0.76564741, "learning_rate": 1.1783275288679203e-06, "loss": 0.78743559, "num_input_tokens_seen": 115534370, "step": 5370, "time_per_iteration": 2.580638885498047 }, { "auxiliary_loss_clip": 0.010534, "auxiliary_loss_mlp": 0.01002279, "balance_loss_clip": 1.01363826, "balance_loss_mlp": 1.00117671, "epoch": 0.6458245656225575, "flos": 60370831088640.0, "grad_norm": 0.840903151825484, "language_loss": 0.57098281, "learning_rate": 1.177617395625262e-06, "loss": 0.59153962, "num_input_tokens_seen": 115592345, "step": 5371, "time_per_iteration": 3.059608221054077 }, { "auxiliary_loss_clip": 0.01156268, "auxiliary_loss_mlp": 0.01026136, "balance_loss_clip": 1.04993773, "balance_loss_mlp": 1.01873875, "epoch": 0.6459448085131967, "flos": 23076771932160.0, "grad_norm": 1.7923630342951629, "language_loss": 0.75284189, "learning_rate": 1.1769073871373908e-06, "loss": 0.77466595, "num_input_tokens_seen": 115612550, "step": 5372, "time_per_iteration": 2.5448827743530273 }, { "auxiliary_loss_clip": 0.01122155, "auxiliary_loss_mlp": 0.01025524, "balance_loss_clip": 1.0416826, "balance_loss_mlp": 1.01822567, "epoch": 0.6460650514038357, "flos": 22598190097920.0, "grad_norm": 2.4945505635053427, "language_loss": 0.83891279, "learning_rate": 1.176197503512015e-06, "loss": 0.86038959, "num_input_tokens_seen": 115632265, "step": 5373, "time_per_iteration": 2.5889172554016113 }, { "auxiliary_loss_clip": 0.01138557, "auxiliary_loss_mlp": 0.01029055, "balance_loss_clip": 1.04721987, "balance_loss_mlp": 1.0223701, "epoch": 0.6461852942944748, "flos": 20266726118400.0, "grad_norm": 2.83986386428047, "language_loss": 0.82229161, "learning_rate": 1.1754877448568223e-06, "loss": 0.84396774, "num_input_tokens_seen": 115651720, "step": 5374, "time_per_iteration": 2.549473524093628 }, { "auxiliary_loss_clip": 0.01142066, "auxiliary_loss_mlp": 0.01023371, "balance_loss_clip": 1.04834247, "balance_loss_mlp": 1.01625133, "epoch": 0.646305537185114, "flos": 23367109564800.0, "grad_norm": 1.88125963598905, "language_loss": 0.90054625, "learning_rate": 1.1747781112794837e-06, "loss": 0.92220062, "num_input_tokens_seen": 115668215, "step": 5375, "time_per_iteration": 2.5463366508483887 }, { "auxiliary_loss_clip": 0.01123557, "auxiliary_loss_mlp": 0.0103128, "balance_loss_clip": 1.04590499, "balance_loss_mlp": 1.02451825, "epoch": 0.646425780075753, "flos": 24277480790400.0, "grad_norm": 1.7577051076550172, "language_loss": 0.83150899, "learning_rate": 1.1740686028876487e-06, "loss": 0.85305738, "num_input_tokens_seen": 115687080, "step": 5376, "time_per_iteration": 2.6011385917663574 }, { "auxiliary_loss_clip": 0.01153916, "auxiliary_loss_mlp": 0.01025795, "balance_loss_clip": 1.05021322, "balance_loss_mlp": 1.01888382, "epoch": 0.6465460229663921, "flos": 20813968800000.0, "grad_norm": 2.04859452399454, "language_loss": 0.74640608, "learning_rate": 1.1733592197889507e-06, "loss": 0.76820314, "num_input_tokens_seen": 115703990, "step": 5377, "time_per_iteration": 2.48874568939209 }, { "auxiliary_loss_clip": 0.01150347, "auxiliary_loss_mlp": 0.01025677, "balance_loss_clip": 1.05021811, "balance_loss_mlp": 1.0192188, "epoch": 0.6466662658570312, "flos": 22853299466880.0, "grad_norm": 1.944513847269682, "language_loss": 0.7237671, "learning_rate": 1.1726499620910014e-06, "loss": 0.74552733, "num_input_tokens_seen": 115724270, "step": 5378, "time_per_iteration": 2.5738139152526855 }, { "auxiliary_loss_clip": 0.0115401, "auxiliary_loss_mlp": 0.01028624, "balance_loss_clip": 1.04959941, "balance_loss_mlp": 1.0209707, "epoch": 0.6467865087476703, "flos": 15304553953920.0, "grad_norm": 2.0835119195803498, "language_loss": 0.7804997, "learning_rate": 1.1719408299013955e-06, "loss": 0.80232602, "num_input_tokens_seen": 115742995, "step": 5379, "time_per_iteration": 2.4694271087646484 }, { "auxiliary_loss_clip": 0.011681, "auxiliary_loss_mlp": 0.01034719, "balance_loss_clip": 1.05111098, "balance_loss_mlp": 1.02725005, "epoch": 0.6469067516383094, "flos": 19573650218880.0, "grad_norm": 2.748265973259102, "language_loss": 0.75764418, "learning_rate": 1.1712318233277067e-06, "loss": 0.77967232, "num_input_tokens_seen": 115762015, "step": 5380, "time_per_iteration": 2.4712114334106445 }, { "auxiliary_loss_clip": 0.01050682, "auxiliary_loss_mlp": 0.01001491, "balance_loss_clip": 1.012797, "balance_loss_mlp": 1.0003581, "epoch": 0.6470269945289485, "flos": 65098002522240.0, "grad_norm": 0.7573884803135872, "language_loss": 0.57891333, "learning_rate": 1.1705229424774916e-06, "loss": 0.59943503, "num_input_tokens_seen": 115816285, "step": 5381, "time_per_iteration": 3.771580457687378 }, { "auxiliary_loss_clip": 0.01138316, "auxiliary_loss_mlp": 0.01034499, "balance_loss_clip": 1.04495907, "balance_loss_mlp": 1.02702153, "epoch": 0.6471472374195876, "flos": 30696943639680.0, "grad_norm": 1.7029346590767969, "language_loss": 0.63955659, "learning_rate": 1.1698141874582867e-06, "loss": 0.66128474, "num_input_tokens_seen": 115837330, "step": 5382, "time_per_iteration": 2.6193666458129883 }, { "auxiliary_loss_clip": 0.0116766, "auxiliary_loss_mlp": 0.01023194, "balance_loss_clip": 1.05160201, "balance_loss_mlp": 1.01642537, "epoch": 0.6472674803102266, "flos": 20521835487360.0, "grad_norm": 2.0011978223460902, "language_loss": 0.7210477, "learning_rate": 1.169105558377609e-06, "loss": 0.74295622, "num_input_tokens_seen": 115857420, "step": 5383, "time_per_iteration": 2.4630322456359863 }, { "auxiliary_loss_clip": 0.0110784, "auxiliary_loss_mlp": 0.00761117, "balance_loss_clip": 1.04874921, "balance_loss_mlp": 1.00021172, "epoch": 0.6473877232008658, "flos": 24715447320960.0, "grad_norm": 1.6018677460296296, "language_loss": 0.78458279, "learning_rate": 1.1683970553429587e-06, "loss": 0.80327237, "num_input_tokens_seen": 115878875, "step": 5384, "time_per_iteration": 2.655327558517456 }, { "auxiliary_loss_clip": 0.01133669, "auxiliary_loss_mlp": 0.01028447, "balance_loss_clip": 1.04844522, "balance_loss_mlp": 1.02101719, "epoch": 0.6475079660915048, "flos": 15885552441600.0, "grad_norm": 1.9621136299950048, "language_loss": 0.82528836, "learning_rate": 1.1676886784618128e-06, "loss": 0.84690952, "num_input_tokens_seen": 115895540, "step": 5385, "time_per_iteration": 2.5378496646881104 }, { "auxiliary_loss_clip": 0.01154535, "auxiliary_loss_mlp": 0.01025949, "balance_loss_clip": 1.05010974, "balance_loss_mlp": 1.0174849, "epoch": 0.6476282089821439, "flos": 17381590922880.0, "grad_norm": 2.0194653023193334, "language_loss": 0.84103811, "learning_rate": 1.1669804278416332e-06, "loss": 0.86284292, "num_input_tokens_seen": 115910265, "step": 5386, "time_per_iteration": 2.491683006286621 }, { "auxiliary_loss_clip": 0.01146968, "auxiliary_loss_mlp": 0.01031282, "balance_loss_clip": 1.0502553, "balance_loss_mlp": 1.02423656, "epoch": 0.6477484518727831, "flos": 20194078861440.0, "grad_norm": 2.136906632592689, "language_loss": 0.71161228, "learning_rate": 1.1662723035898602e-06, "loss": 0.73339486, "num_input_tokens_seen": 115930025, "step": 5387, "time_per_iteration": 2.520371437072754 }, { "auxiliary_loss_clip": 0.011551, "auxiliary_loss_mlp": 0.01028264, "balance_loss_clip": 1.04968417, "balance_loss_mlp": 1.02049172, "epoch": 0.6478686947634221, "flos": 25410426641280.0, "grad_norm": 1.5729017838627806, "language_loss": 0.81829178, "learning_rate": 1.165564305813915e-06, "loss": 0.84012544, "num_input_tokens_seen": 115949025, "step": 5388, "time_per_iteration": 2.5456418991088867 }, { "auxiliary_loss_clip": 0.01154641, "auxiliary_loss_mlp": 0.01026778, "balance_loss_clip": 1.04931998, "balance_loss_mlp": 1.01965487, "epoch": 0.6479889376540612, "flos": 20083581648000.0, "grad_norm": 1.7666772639743735, "language_loss": 0.8065573, "learning_rate": 1.1648564346212019e-06, "loss": 0.82837141, "num_input_tokens_seen": 115968145, "step": 5389, "time_per_iteration": 2.483328104019165 }, { "auxiliary_loss_clip": 0.011511, "auxiliary_loss_mlp": 0.01029335, "balance_loss_clip": 1.05027759, "balance_loss_mlp": 1.02178049, "epoch": 0.6481091805447003, "flos": 26758082039040.0, "grad_norm": 1.8870826808943437, "language_loss": 0.76030773, "learning_rate": 1.164148690119104e-06, "loss": 0.78211206, "num_input_tokens_seen": 115989425, "step": 5390, "time_per_iteration": 3.345306873321533 }, { "auxiliary_loss_clip": 0.01166615, "auxiliary_loss_mlp": 0.01025718, "balance_loss_clip": 1.04954386, "balance_loss_mlp": 1.01864314, "epoch": 0.6482294234353394, "flos": 23952094462080.0, "grad_norm": 1.6891317099687795, "language_loss": 0.73865342, "learning_rate": 1.163441072414985e-06, "loss": 0.76057673, "num_input_tokens_seen": 116009630, "step": 5391, "time_per_iteration": 2.4691548347473145 }, { "auxiliary_loss_clip": 0.01155966, "auxiliary_loss_mlp": 0.01029089, "balance_loss_clip": 1.05210853, "balance_loss_mlp": 1.02157032, "epoch": 0.6483496663259785, "flos": 26209833776640.0, "grad_norm": 1.9334896599943103, "language_loss": 0.69779927, "learning_rate": 1.16273358161619e-06, "loss": 0.71964985, "num_input_tokens_seen": 116029965, "step": 5392, "time_per_iteration": 3.3189268112182617 }, { "auxiliary_loss_clip": 0.01146759, "auxiliary_loss_mlp": 0.01031508, "balance_loss_clip": 1.04942632, "balance_loss_mlp": 1.02381659, "epoch": 0.6484699092166175, "flos": 20922239370240.0, "grad_norm": 1.7630131578834347, "language_loss": 0.8403827, "learning_rate": 1.1620262178300446e-06, "loss": 0.86216539, "num_input_tokens_seen": 116048580, "step": 5393, "time_per_iteration": 3.458660840988159 }, { "auxiliary_loss_clip": 0.01124631, "auxiliary_loss_mlp": 0.01023857, "balance_loss_clip": 1.04270959, "balance_loss_mlp": 1.01620984, "epoch": 0.6485901521072567, "flos": 33072865678080.0, "grad_norm": 1.7327327100683878, "language_loss": 0.75736743, "learning_rate": 1.1613189811638563e-06, "loss": 0.77885234, "num_input_tokens_seen": 116070305, "step": 5394, "time_per_iteration": 2.64713454246521 }, { "auxiliary_loss_clip": 0.01155944, "auxiliary_loss_mlp": 0.01032709, "balance_loss_clip": 1.04936719, "balance_loss_mlp": 1.02552652, "epoch": 0.6487103949978957, "flos": 22274060745600.0, "grad_norm": 1.5644936170146648, "language_loss": 0.77938628, "learning_rate": 1.1606118717249117e-06, "loss": 0.80127281, "num_input_tokens_seen": 116090405, "step": 5395, "time_per_iteration": 2.5174057483673096 }, { "auxiliary_loss_clip": 0.0117224, "auxiliary_loss_mlp": 0.01026917, "balance_loss_clip": 1.05043483, "balance_loss_mlp": 1.01934767, "epoch": 0.6488306378885348, "flos": 22930400010240.0, "grad_norm": 1.6827983373764588, "language_loss": 0.67827934, "learning_rate": 1.1599048896204787e-06, "loss": 0.70027089, "num_input_tokens_seen": 116110285, "step": 5396, "time_per_iteration": 2.4828875064849854 }, { "auxiliary_loss_clip": 0.01126596, "auxiliary_loss_mlp": 0.01025089, "balance_loss_clip": 1.04460692, "balance_loss_mlp": 1.01805007, "epoch": 0.648950880779174, "flos": 20376110010240.0, "grad_norm": 1.672253267194269, "language_loss": 0.80777907, "learning_rate": 1.1591980349578061e-06, "loss": 0.82929593, "num_input_tokens_seen": 116128955, "step": 5397, "time_per_iteration": 2.570786237716675 }, { "auxiliary_loss_clip": 0.01029758, "auxiliary_loss_mlp": 0.01005511, "balance_loss_clip": 1.01211357, "balance_loss_mlp": 1.00435448, "epoch": 0.649071123669813, "flos": 59930889310080.0, "grad_norm": 0.732164568434597, "language_loss": 0.54363608, "learning_rate": 1.158491307844123e-06, "loss": 0.5639888, "num_input_tokens_seen": 116188875, "step": 5398, "time_per_iteration": 3.1408488750457764 }, { "auxiliary_loss_clip": 0.01139231, "auxiliary_loss_mlp": 0.01024503, "balance_loss_clip": 1.04820168, "balance_loss_mlp": 1.01740146, "epoch": 0.6491913665604521, "flos": 20446566537600.0, "grad_norm": 1.58154504178718, "language_loss": 0.83944654, "learning_rate": 1.1577847083866387e-06, "loss": 0.86108387, "num_input_tokens_seen": 116207910, "step": 5399, "time_per_iteration": 2.51816725730896 }, { "auxiliary_loss_clip": 0.0112982, "auxiliary_loss_mlp": 0.0102552, "balance_loss_clip": 1.04548097, "balance_loss_mlp": 1.01805723, "epoch": 0.6493116094510912, "flos": 16946820702720.0, "grad_norm": 1.6907219860000646, "language_loss": 0.71921897, "learning_rate": 1.1570782366925453e-06, "loss": 0.74077231, "num_input_tokens_seen": 116226425, "step": 5400, "time_per_iteration": 2.5119876861572266 }, { "auxiliary_loss_clip": 0.01141029, "auxiliary_loss_mlp": 0.010286, "balance_loss_clip": 1.04288018, "balance_loss_mlp": 1.02078557, "epoch": 0.6494318523417303, "flos": 18802935072000.0, "grad_norm": 1.5891434170505723, "language_loss": 0.75683439, "learning_rate": 1.1563718928690132e-06, "loss": 0.7785306, "num_input_tokens_seen": 116243860, "step": 5401, "time_per_iteration": 2.5209498405456543 }, { "auxiliary_loss_clip": 0.01129183, "auxiliary_loss_mlp": 0.01030852, "balance_loss_clip": 1.04732728, "balance_loss_mlp": 1.02300453, "epoch": 0.6495520952323693, "flos": 18982847318400.0, "grad_norm": 2.1338126646907294, "language_loss": 0.71291423, "learning_rate": 1.1556656770231942e-06, "loss": 0.73451453, "num_input_tokens_seen": 116260055, "step": 5402, "time_per_iteration": 2.5493288040161133 }, { "auxiliary_loss_clip": 0.01154696, "auxiliary_loss_mlp": 0.0102483, "balance_loss_clip": 1.04731405, "balance_loss_mlp": 1.01768029, "epoch": 0.6496723381230085, "flos": 22745388032640.0, "grad_norm": 1.6662418293025814, "language_loss": 0.76284921, "learning_rate": 1.1549595892622207e-06, "loss": 0.78464448, "num_input_tokens_seen": 116278825, "step": 5403, "time_per_iteration": 2.508307456970215 }, { "auxiliary_loss_clip": 0.01013712, "auxiliary_loss_mlp": 0.01000794, "balance_loss_clip": 1.01367521, "balance_loss_mlp": 0.99973357, "epoch": 0.6497925810136476, "flos": 62145283887360.0, "grad_norm": 0.8243421932327678, "language_loss": 0.59035981, "learning_rate": 1.1542536296932047e-06, "loss": 0.61050487, "num_input_tokens_seen": 116342360, "step": 5404, "time_per_iteration": 3.1260552406311035 }, { "auxiliary_loss_clip": 0.01133865, "auxiliary_loss_mlp": 0.01035449, "balance_loss_clip": 1.04579735, "balance_loss_mlp": 1.02757549, "epoch": 0.6499128239042866, "flos": 20156731695360.0, "grad_norm": 2.087644935859129, "language_loss": 0.69894159, "learning_rate": 1.1535477984232414e-06, "loss": 0.72063476, "num_input_tokens_seen": 116362235, "step": 5405, "time_per_iteration": 2.5685324668884277 }, { "auxiliary_loss_clip": 0.01109882, "auxiliary_loss_mlp": 0.01023637, "balance_loss_clip": 1.03946042, "balance_loss_mlp": 1.01650524, "epoch": 0.6500330667949258, "flos": 24462420940800.0, "grad_norm": 1.8103330189807276, "language_loss": 0.76580441, "learning_rate": 1.152842095559404e-06, "loss": 0.78713965, "num_input_tokens_seen": 116382895, "step": 5406, "time_per_iteration": 2.6338613033294678 }, { "auxiliary_loss_clip": 0.01143032, "auxiliary_loss_mlp": 0.01025991, "balance_loss_clip": 1.04552042, "balance_loss_mlp": 1.01933038, "epoch": 0.6501533096855648, "flos": 25477399549440.0, "grad_norm": 1.6071461418306918, "language_loss": 0.76477838, "learning_rate": 1.1521365212087474e-06, "loss": 0.78646863, "num_input_tokens_seen": 116402880, "step": 5407, "time_per_iteration": 3.3873884677886963 }, { "auxiliary_loss_clip": 0.01154704, "auxiliary_loss_mlp": 0.01027634, "balance_loss_clip": 1.04838824, "balance_loss_mlp": 1.02023101, "epoch": 0.6502735525762039, "flos": 44819245347840.0, "grad_norm": 1.7393072622893384, "language_loss": 0.70943433, "learning_rate": 1.1514310754783062e-06, "loss": 0.73125768, "num_input_tokens_seen": 116425830, "step": 5408, "time_per_iteration": 2.697362184524536 }, { "auxiliary_loss_clip": 0.01143671, "auxiliary_loss_mlp": 0.01027476, "balance_loss_clip": 1.04888296, "balance_loss_mlp": 1.02031779, "epoch": 0.6503937954668431, "flos": 28658546726400.0, "grad_norm": 1.9356839266032662, "language_loss": 0.73135942, "learning_rate": 1.1507257584750964e-06, "loss": 0.75307095, "num_input_tokens_seen": 116446010, "step": 5409, "time_per_iteration": 2.577609062194824 }, { "auxiliary_loss_clip": 0.01169387, "auxiliary_loss_mlp": 0.01027761, "balance_loss_clip": 1.05169737, "balance_loss_mlp": 1.0200038, "epoch": 0.6505140383574821, "flos": 20922562592640.0, "grad_norm": 1.8243131288719279, "language_loss": 0.77342844, "learning_rate": 1.150020570306113e-06, "loss": 0.7953999, "num_input_tokens_seen": 116465150, "step": 5410, "time_per_iteration": 2.46199631690979 }, { "auxiliary_loss_clip": 0.01133742, "auxiliary_loss_mlp": 0.01026627, "balance_loss_clip": 1.0427382, "balance_loss_mlp": 1.01870835, "epoch": 0.6506342812481212, "flos": 20595236929920.0, "grad_norm": 1.8752421526738936, "language_loss": 0.74819291, "learning_rate": 1.1493155110783338e-06, "loss": 0.76979661, "num_input_tokens_seen": 116483675, "step": 5411, "time_per_iteration": 2.506695508956909 }, { "auxiliary_loss_clip": 0.01155279, "auxiliary_loss_mlp": 0.01029054, "balance_loss_clip": 1.04927921, "balance_loss_mlp": 1.02131724, "epoch": 0.6507545241387603, "flos": 30226478279040.0, "grad_norm": 1.9712439333586247, "language_loss": 0.70351505, "learning_rate": 1.1486105808987155e-06, "loss": 0.72535837, "num_input_tokens_seen": 116505165, "step": 5412, "time_per_iteration": 2.5535600185394287 }, { "auxiliary_loss_clip": 0.01156513, "auxiliary_loss_mlp": 0.0103346, "balance_loss_clip": 1.05036104, "balance_loss_mlp": 1.02601564, "epoch": 0.6508747670293994, "flos": 17128241320320.0, "grad_norm": 1.8008779568146736, "language_loss": 0.81114495, "learning_rate": 1.1479057798741947e-06, "loss": 0.83304471, "num_input_tokens_seen": 116523220, "step": 5413, "time_per_iteration": 2.4583325386047363 }, { "auxiliary_loss_clip": 0.01034833, "auxiliary_loss_mlp": 0.01007086, "balance_loss_clip": 1.01075816, "balance_loss_mlp": 1.00601327, "epoch": 0.6509950099200384, "flos": 68559826573440.0, "grad_norm": 0.7860396829514378, "language_loss": 0.53366607, "learning_rate": 1.14720110811169e-06, "loss": 0.55408525, "num_input_tokens_seen": 116580450, "step": 5414, "time_per_iteration": 3.122953414916992 }, { "auxiliary_loss_clip": 0.01160988, "auxiliary_loss_mlp": 0.01026675, "balance_loss_clip": 1.05307531, "balance_loss_mlp": 1.01900434, "epoch": 0.6511152528106776, "flos": 22347462188160.0, "grad_norm": 2.2926688040353382, "language_loss": 0.76902199, "learning_rate": 1.146496565718098e-06, "loss": 0.79089868, "num_input_tokens_seen": 116601020, "step": 5415, "time_per_iteration": 2.5184547901153564 }, { "auxiliary_loss_clip": 0.01139415, "auxiliary_loss_mlp": 0.01027309, "balance_loss_clip": 1.04830551, "balance_loss_mlp": 1.01961088, "epoch": 0.6512354957013167, "flos": 20522158709760.0, "grad_norm": 1.9195947343853785, "language_loss": 0.75613767, "learning_rate": 1.1457921528002996e-06, "loss": 0.77780485, "num_input_tokens_seen": 116619455, "step": 5416, "time_per_iteration": 3.29231858253479 }, { "auxiliary_loss_clip": 0.01169698, "auxiliary_loss_mlp": 0.0076143, "balance_loss_clip": 1.0506258, "balance_loss_mlp": 1.00020623, "epoch": 0.6513557385919557, "flos": 32337342881280.0, "grad_norm": 6.078759950101584, "language_loss": 0.72211665, "learning_rate": 1.1450878694651522e-06, "loss": 0.7414279, "num_input_tokens_seen": 116640020, "step": 5417, "time_per_iteration": 2.5563583374023438 }, { "auxiliary_loss_clip": 0.0111162, "auxiliary_loss_mlp": 0.0102937, "balance_loss_clip": 1.04125071, "balance_loss_mlp": 1.02232468, "epoch": 0.6514759814825949, "flos": 12093206417280.0, "grad_norm": 2.760350890286075, "language_loss": 0.63058913, "learning_rate": 1.1443837158194954e-06, "loss": 0.651999, "num_input_tokens_seen": 116655165, "step": 5418, "time_per_iteration": 3.373089075088501 }, { "auxiliary_loss_clip": 0.01128283, "auxiliary_loss_mlp": 0.01035143, "balance_loss_clip": 1.04861832, "balance_loss_mlp": 1.02765083, "epoch": 0.651596224373234, "flos": 22526907557760.0, "grad_norm": 1.7389972844986408, "language_loss": 0.74407196, "learning_rate": 1.1436796919701484e-06, "loss": 0.76570618, "num_input_tokens_seen": 116673880, "step": 5419, "time_per_iteration": 3.258052349090576 }, { "auxiliary_loss_clip": 0.01139662, "auxiliary_loss_mlp": 0.0103273, "balance_loss_clip": 1.04817224, "balance_loss_mlp": 1.02555394, "epoch": 0.651716467263873, "flos": 27818955250560.0, "grad_norm": 1.7637940008279633, "language_loss": 0.6185689, "learning_rate": 1.1429757980239115e-06, "loss": 0.64029282, "num_input_tokens_seen": 116694305, "step": 5420, "time_per_iteration": 2.5779592990875244 }, { "auxiliary_loss_clip": 0.01170673, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 1.04985023, "balance_loss_mlp": 1.02125919, "epoch": 0.6518367101545122, "flos": 24316300414080.0, "grad_norm": 2.468568569870674, "language_loss": 0.81537086, "learning_rate": 1.1422720340875636e-06, "loss": 0.83737129, "num_input_tokens_seen": 116713055, "step": 5421, "time_per_iteration": 2.4728310108184814 }, { "auxiliary_loss_clip": 0.01159162, "auxiliary_loss_mlp": 0.01034495, "balance_loss_clip": 1.04809153, "balance_loss_mlp": 1.02647817, "epoch": 0.6519569530451512, "flos": 20011939971840.0, "grad_norm": 2.140041212003667, "language_loss": 0.79481304, "learning_rate": 1.1415684002678671e-06, "loss": 0.81674957, "num_input_tokens_seen": 116731815, "step": 5422, "time_per_iteration": 2.4942705631256104 }, { "auxiliary_loss_clip": 0.01143053, "auxiliary_loss_mlp": 0.01026741, "balance_loss_clip": 1.0451498, "balance_loss_mlp": 1.01829755, "epoch": 0.6520771959357903, "flos": 21576064682880.0, "grad_norm": 1.9603442988834594, "language_loss": 0.7765432, "learning_rate": 1.1408648966715617e-06, "loss": 0.79824114, "num_input_tokens_seen": 116749335, "step": 5423, "time_per_iteration": 2.5174977779388428 }, { "auxiliary_loss_clip": 0.01139841, "auxiliary_loss_mlp": 0.0102888, "balance_loss_clip": 1.04374146, "balance_loss_mlp": 1.02083921, "epoch": 0.6521974388264293, "flos": 22711021695360.0, "grad_norm": 1.8823858862340872, "language_loss": 0.72615111, "learning_rate": 1.1401615234053683e-06, "loss": 0.74783826, "num_input_tokens_seen": 116768155, "step": 5424, "time_per_iteration": 2.5169951915740967 }, { "auxiliary_loss_clip": 0.01143244, "auxiliary_loss_mlp": 0.01024849, "balance_loss_clip": 1.04653502, "balance_loss_mlp": 1.01754737, "epoch": 0.6523176817170685, "flos": 23002939526400.0, "grad_norm": 2.1650782425964494, "language_loss": 0.75934827, "learning_rate": 1.1394582805759885e-06, "loss": 0.78102911, "num_input_tokens_seen": 116787435, "step": 5425, "time_per_iteration": 2.522138833999634 }, { "auxiliary_loss_clip": 0.0115476, "auxiliary_loss_mlp": 0.01026721, "balance_loss_clip": 1.04967237, "balance_loss_mlp": 1.01869273, "epoch": 0.6524379246077076, "flos": 21688249835520.0, "grad_norm": 1.688368495305262, "language_loss": 0.75497508, "learning_rate": 1.1387551682901022e-06, "loss": 0.77678984, "num_input_tokens_seen": 116808040, "step": 5426, "time_per_iteration": 2.500398635864258 }, { "auxiliary_loss_clip": 0.01120939, "auxiliary_loss_mlp": 0.01027841, "balance_loss_clip": 1.04183626, "balance_loss_mlp": 1.02014041, "epoch": 0.6525581674983466, "flos": 19390936711680.0, "grad_norm": 1.87369360923161, "language_loss": 0.70748276, "learning_rate": 1.138052186654373e-06, "loss": 0.72897059, "num_input_tokens_seen": 116825510, "step": 5427, "time_per_iteration": 2.529946804046631 }, { "auxiliary_loss_clip": 0.01143561, "auxiliary_loss_mlp": 0.01031055, "balance_loss_clip": 1.04792464, "balance_loss_mlp": 1.02350664, "epoch": 0.6526784103889858, "flos": 17165444832000.0, "grad_norm": 2.3213033898250934, "language_loss": 0.87857103, "learning_rate": 1.1373493357754417e-06, "loss": 0.90031719, "num_input_tokens_seen": 116844415, "step": 5428, "time_per_iteration": 2.5116307735443115 }, { "auxiliary_loss_clip": 0.01167631, "auxiliary_loss_mlp": 0.01025377, "balance_loss_clip": 1.04855597, "balance_loss_mlp": 1.01847494, "epoch": 0.6527986532796248, "flos": 18989168112000.0, "grad_norm": 2.1333095800533988, "language_loss": 0.77011192, "learning_rate": 1.1366466157599303e-06, "loss": 0.79204202, "num_input_tokens_seen": 116863690, "step": 5429, "time_per_iteration": 2.491112232208252 }, { "auxiliary_loss_clip": 0.01107497, "auxiliary_loss_mlp": 0.00760976, "balance_loss_clip": 1.04170346, "balance_loss_mlp": 1.00025845, "epoch": 0.6529188961702639, "flos": 14238581011200.0, "grad_norm": 2.2298835995679043, "language_loss": 0.76055115, "learning_rate": 1.1359440267144412e-06, "loss": 0.7792359, "num_input_tokens_seen": 116881145, "step": 5430, "time_per_iteration": 2.652953624725342 }, { "auxiliary_loss_clip": 0.01155448, "auxiliary_loss_mlp": 0.01033553, "balance_loss_clip": 1.04789424, "balance_loss_mlp": 1.02664232, "epoch": 0.653039139060903, "flos": 36682929158400.0, "grad_norm": 1.9587974030745092, "language_loss": 0.74171352, "learning_rate": 1.1352415687455556e-06, "loss": 0.76360351, "num_input_tokens_seen": 116902405, "step": 5431, "time_per_iteration": 2.643061637878418 }, { "auxiliary_loss_clip": 0.01156006, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.05111766, "balance_loss_mlp": 1.02173424, "epoch": 0.6531593819515421, "flos": 25376275785600.0, "grad_norm": 2.2817222488333306, "language_loss": 0.63780856, "learning_rate": 1.1345392419598362e-06, "loss": 0.65966022, "num_input_tokens_seen": 116921285, "step": 5432, "time_per_iteration": 2.5335605144500732 }, { "auxiliary_loss_clip": 0.0114947, "auxiliary_loss_mlp": 0.01026864, "balance_loss_clip": 1.04698825, "balance_loss_mlp": 1.01926184, "epoch": 0.6532796248421812, "flos": 21178533888000.0, "grad_norm": 1.5162610583897513, "language_loss": 0.7174319, "learning_rate": 1.1338370464638263e-06, "loss": 0.73919523, "num_input_tokens_seen": 116940685, "step": 5433, "time_per_iteration": 3.2798705101013184 }, { "auxiliary_loss_clip": 0.01167552, "auxiliary_loss_mlp": 0.01023884, "balance_loss_clip": 1.04757571, "balance_loss_mlp": 1.01657355, "epoch": 0.6533998677328203, "flos": 17675950878720.0, "grad_norm": 2.003982115099924, "language_loss": 0.63702309, "learning_rate": 1.1331349823640474e-06, "loss": 0.65893745, "num_input_tokens_seen": 116958115, "step": 5434, "time_per_iteration": 2.5335097312927246 }, { "auxiliary_loss_clip": 0.01153997, "auxiliary_loss_mlp": 0.00760446, "balance_loss_clip": 1.04788756, "balance_loss_mlp": 1.0002594, "epoch": 0.6535201106234594, "flos": 28400384701440.0, "grad_norm": 2.065219746491407, "language_loss": 0.77803862, "learning_rate": 1.132433049767003e-06, "loss": 0.79718304, "num_input_tokens_seen": 116976030, "step": 5435, "time_per_iteration": 2.5605647563934326 }, { "auxiliary_loss_clip": 0.01140338, "auxiliary_loss_mlp": 0.01027394, "balance_loss_clip": 1.04840446, "balance_loss_mlp": 1.02054799, "epoch": 0.6536403535140984, "flos": 23586667447680.0, "grad_norm": 1.7827630882289778, "language_loss": 0.80858266, "learning_rate": 1.1317312487791748e-06, "loss": 0.83025998, "num_input_tokens_seen": 116997680, "step": 5436, "time_per_iteration": 2.5478522777557373 }, { "auxiliary_loss_clip": 0.01148374, "auxiliary_loss_mlp": 0.01026691, "balance_loss_clip": 1.04568648, "balance_loss_mlp": 1.01904392, "epoch": 0.6537605964047376, "flos": 21579476474880.0, "grad_norm": 2.010792245561881, "language_loss": 0.73236609, "learning_rate": 1.1310295795070253e-06, "loss": 0.75411677, "num_input_tokens_seen": 117017620, "step": 5437, "time_per_iteration": 2.476300001144409 }, { "auxiliary_loss_clip": 0.01115559, "auxiliary_loss_mlp": 0.01033477, "balance_loss_clip": 1.04492652, "balance_loss_mlp": 1.02648878, "epoch": 0.6538808392953767, "flos": 26833997433600.0, "grad_norm": 1.7651852704788737, "language_loss": 0.80499816, "learning_rate": 1.1303280420569982e-06, "loss": 0.82648849, "num_input_tokens_seen": 117039505, "step": 5438, "time_per_iteration": 2.6357927322387695 }, { "auxiliary_loss_clip": 0.01148789, "auxiliary_loss_mlp": 0.01028934, "balance_loss_clip": 1.04653907, "balance_loss_mlp": 1.02183855, "epoch": 0.6540010821860157, "flos": 30738241301760.0, "grad_norm": 1.6062057564808938, "language_loss": 0.77092338, "learning_rate": 1.1296266365355158e-06, "loss": 0.79270059, "num_input_tokens_seen": 117062890, "step": 5439, "time_per_iteration": 2.581784963607788 }, { "auxiliary_loss_clip": 0.01134941, "auxiliary_loss_mlp": 0.01029823, "balance_loss_clip": 1.04998517, "balance_loss_mlp": 1.02166033, "epoch": 0.6541213250766549, "flos": 26907147480960.0, "grad_norm": 1.9048859174638555, "language_loss": 0.74205482, "learning_rate": 1.1289253630489806e-06, "loss": 0.76370239, "num_input_tokens_seen": 117083940, "step": 5440, "time_per_iteration": 2.651430368423462 }, { "auxiliary_loss_clip": 0.01160215, "auxiliary_loss_mlp": 0.01030818, "balance_loss_clip": 1.04779172, "balance_loss_mlp": 1.02292013, "epoch": 0.6542415679672939, "flos": 19172384409600.0, "grad_norm": 1.9277055197498014, "language_loss": 0.72163033, "learning_rate": 1.1282242217037753e-06, "loss": 0.74354064, "num_input_tokens_seen": 117101440, "step": 5441, "time_per_iteration": 3.2570526599884033 }, { "auxiliary_loss_clip": 0.0110901, "auxiliary_loss_mlp": 0.01032171, "balance_loss_clip": 1.04170024, "balance_loss_mlp": 1.02380252, "epoch": 0.654361810857933, "flos": 48173517100800.0, "grad_norm": 2.694827500064338, "language_loss": 0.62048841, "learning_rate": 1.127523212606262e-06, "loss": 0.64190018, "num_input_tokens_seen": 117124265, "step": 5442, "time_per_iteration": 2.8150174617767334 }, { "auxiliary_loss_clip": 0.01153988, "auxiliary_loss_mlp": 0.01031856, "balance_loss_clip": 1.04760766, "balance_loss_mlp": 1.02466822, "epoch": 0.6544820537485722, "flos": 26943165843840.0, "grad_norm": 1.567245481564848, "language_loss": 0.73010802, "learning_rate": 1.1268223358627835e-06, "loss": 0.75196648, "num_input_tokens_seen": 117146755, "step": 5443, "time_per_iteration": 2.5406782627105713 }, { "auxiliary_loss_clip": 0.01169415, "auxiliary_loss_mlp": 0.01026769, "balance_loss_clip": 1.04970324, "balance_loss_mlp": 1.01906836, "epoch": 0.6546022966392112, "flos": 20886328748160.0, "grad_norm": 1.7671875198862104, "language_loss": 0.71524739, "learning_rate": 1.126121591579663e-06, "loss": 0.7372092, "num_input_tokens_seen": 117165960, "step": 5444, "time_per_iteration": 3.2688372135162354 }, { "auxiliary_loss_clip": 0.01152696, "auxiliary_loss_mlp": 0.01024446, "balance_loss_clip": 1.04932737, "balance_loss_mlp": 1.0173974, "epoch": 0.6547225395298503, "flos": 24936693143040.0, "grad_norm": 1.5705647188111869, "language_loss": 0.6875428, "learning_rate": 1.1254209798632018e-06, "loss": 0.70931417, "num_input_tokens_seen": 117186980, "step": 5445, "time_per_iteration": 3.2837698459625244 }, { "auxiliary_loss_clip": 0.01086055, "auxiliary_loss_mlp": 0.01030177, "balance_loss_clip": 1.03774285, "balance_loss_mlp": 1.02331638, "epoch": 0.6548427824204894, "flos": 22565942663040.0, "grad_norm": 1.7924037191174962, "language_loss": 0.84595871, "learning_rate": 1.124720500819683e-06, "loss": 0.8671211, "num_input_tokens_seen": 117205135, "step": 5446, "time_per_iteration": 2.6200053691864014 }, { "auxiliary_loss_clip": 0.01171968, "auxiliary_loss_mlp": 0.01030582, "balance_loss_clip": 1.05258346, "balance_loss_mlp": 1.02263689, "epoch": 0.6549630253111285, "flos": 18442500048000.0, "grad_norm": 1.727811063686695, "language_loss": 0.8218323, "learning_rate": 1.1240201545553682e-06, "loss": 0.84385788, "num_input_tokens_seen": 117222935, "step": 5447, "time_per_iteration": 2.4540579319000244 }, { "auxiliary_loss_clip": 0.01125208, "auxiliary_loss_mlp": 0.01029786, "balance_loss_clip": 1.0449276, "balance_loss_mlp": 1.02231145, "epoch": 0.6550832682017675, "flos": 25187313312000.0, "grad_norm": 9.527498504966, "language_loss": 0.72851151, "learning_rate": 1.1233199411764987e-06, "loss": 0.75006151, "num_input_tokens_seen": 117242370, "step": 5448, "time_per_iteration": 2.5746569633483887 }, { "auxiliary_loss_clip": 0.01114819, "auxiliary_loss_mlp": 0.01021628, "balance_loss_clip": 1.04276121, "balance_loss_mlp": 1.01424587, "epoch": 0.6552035110924067, "flos": 22748153379840.0, "grad_norm": 1.7209953386803591, "language_loss": 0.68918955, "learning_rate": 1.1226198607892978e-06, "loss": 0.710554, "num_input_tokens_seen": 117262930, "step": 5449, "time_per_iteration": 2.567357063293457 }, { "auxiliary_loss_clip": 0.01116992, "auxiliary_loss_mlp": 0.01026723, "balance_loss_clip": 1.04650557, "balance_loss_mlp": 1.01950812, "epoch": 0.6553237539830458, "flos": 21799178012160.0, "grad_norm": 2.066297570341196, "language_loss": 0.79976583, "learning_rate": 1.1219199134999664e-06, "loss": 0.82120299, "num_input_tokens_seen": 117281430, "step": 5450, "time_per_iteration": 2.5963332653045654 }, { "auxiliary_loss_clip": 0.01137605, "auxiliary_loss_mlp": 0.0102835, "balance_loss_clip": 1.04673088, "balance_loss_mlp": 1.02061927, "epoch": 0.6554439968736848, "flos": 20887226588160.0, "grad_norm": 4.870540980115804, "language_loss": 0.78874111, "learning_rate": 1.1212200994146863e-06, "loss": 0.81040072, "num_input_tokens_seen": 117299185, "step": 5451, "time_per_iteration": 2.510100841522217 }, { "auxiliary_loss_clip": 0.01120896, "auxiliary_loss_mlp": 0.01025758, "balance_loss_clip": 1.03885198, "balance_loss_mlp": 1.01792049, "epoch": 0.655564239764324, "flos": 16139045698560.0, "grad_norm": 1.716750942511835, "language_loss": 0.75635278, "learning_rate": 1.120520418639618e-06, "loss": 0.77781934, "num_input_tokens_seen": 117317720, "step": 5452, "time_per_iteration": 2.5658695697784424 }, { "auxiliary_loss_clip": 0.01155711, "auxiliary_loss_mlp": 0.01026105, "balance_loss_clip": 1.05017447, "balance_loss_mlp": 1.0190239, "epoch": 0.655684482654963, "flos": 29570354496000.0, "grad_norm": 2.4190950596145067, "language_loss": 0.83451569, "learning_rate": 1.119820871280903e-06, "loss": 0.85633379, "num_input_tokens_seen": 117338795, "step": 5453, "time_per_iteration": 2.581064462661743 }, { "auxiliary_loss_clip": 0.01154464, "auxiliary_loss_mlp": 0.01027794, "balance_loss_clip": 1.04772854, "balance_loss_mlp": 1.02018261, "epoch": 0.6558047255456021, "flos": 29789409588480.0, "grad_norm": 1.935642870233234, "language_loss": 0.73469055, "learning_rate": 1.1191214574446614e-06, "loss": 0.75651312, "num_input_tokens_seen": 117359040, "step": 5454, "time_per_iteration": 2.5680932998657227 }, { "auxiliary_loss_clip": 0.01136118, "auxiliary_loss_mlp": 0.01028248, "balance_loss_clip": 1.04620004, "balance_loss_mlp": 1.02102351, "epoch": 0.6559249684362413, "flos": 29059166090880.0, "grad_norm": 1.5937095356830602, "language_loss": 0.80105704, "learning_rate": 1.118422177236995e-06, "loss": 0.82270074, "num_input_tokens_seen": 117380865, "step": 5455, "time_per_iteration": 2.593764066696167 }, { "auxiliary_loss_clip": 0.01140722, "auxiliary_loss_mlp": 0.0102548, "balance_loss_clip": 1.04595661, "balance_loss_mlp": 1.0174278, "epoch": 0.6560452113268803, "flos": 20225464369920.0, "grad_norm": 2.3005430970029312, "language_loss": 0.85712254, "learning_rate": 1.1177230307639835e-06, "loss": 0.87878454, "num_input_tokens_seen": 117398405, "step": 5456, "time_per_iteration": 2.5446839332580566 }, { "auxiliary_loss_clip": 0.01121491, "auxiliary_loss_mlp": 0.01030874, "balance_loss_clip": 1.04395103, "balance_loss_mlp": 1.02365828, "epoch": 0.6561654542175194, "flos": 25045538330880.0, "grad_norm": 1.6508019213303118, "language_loss": 0.78711754, "learning_rate": 1.1170240181316865e-06, "loss": 0.8086412, "num_input_tokens_seen": 117419850, "step": 5457, "time_per_iteration": 2.607171058654785 }, { "auxiliary_loss_clip": 0.0111954, "auxiliary_loss_mlp": 0.01028517, "balance_loss_clip": 1.04087067, "balance_loss_mlp": 1.02064323, "epoch": 0.6562856971081584, "flos": 22856711258880.0, "grad_norm": 2.0198146436567277, "language_loss": 0.7964766, "learning_rate": 1.1163251394461442e-06, "loss": 0.81795728, "num_input_tokens_seen": 117438330, "step": 5458, "time_per_iteration": 2.5665955543518066 }, { "auxiliary_loss_clip": 0.01152936, "auxiliary_loss_mlp": 0.01026815, "balance_loss_clip": 1.04834151, "balance_loss_mlp": 1.01862788, "epoch": 0.6564059399987976, "flos": 18872565586560.0, "grad_norm": 1.81068764104059, "language_loss": 0.82065606, "learning_rate": 1.1156263948133746e-06, "loss": 0.8424536, "num_input_tokens_seen": 117454985, "step": 5459, "time_per_iteration": 3.52496337890625 }, { "auxiliary_loss_clip": 0.01106005, "auxiliary_loss_mlp": 0.00761399, "balance_loss_clip": 1.04359102, "balance_loss_mlp": 1.00023341, "epoch": 0.6565261828894366, "flos": 25484187219840.0, "grad_norm": 1.593804037533109, "language_loss": 0.77661335, "learning_rate": 1.1149277843393787e-06, "loss": 0.79528743, "num_input_tokens_seen": 117476145, "step": 5460, "time_per_iteration": 2.6693572998046875 }, { "auxiliary_loss_clip": 0.01090887, "auxiliary_loss_mlp": 0.00761061, "balance_loss_clip": 1.03721797, "balance_loss_mlp": 1.00020564, "epoch": 0.6566464257800757, "flos": 19683500987520.0, "grad_norm": 2.6396831404628385, "language_loss": 0.63463479, "learning_rate": 1.1142293081301342e-06, "loss": 0.65315425, "num_input_tokens_seen": 117494025, "step": 5461, "time_per_iteration": 2.6189284324645996 }, { "auxiliary_loss_clip": 0.01135655, "auxiliary_loss_mlp": 0.01026732, "balance_loss_clip": 1.04522777, "balance_loss_mlp": 1.02011836, "epoch": 0.6567666686707149, "flos": 23514127931520.0, "grad_norm": 1.5642342096534616, "language_loss": 0.67855269, "learning_rate": 1.1135309662915995e-06, "loss": 0.70017654, "num_input_tokens_seen": 117514190, "step": 5462, "time_per_iteration": 2.554874897003174 }, { "auxiliary_loss_clip": 0.01115321, "auxiliary_loss_mlp": 0.01028641, "balance_loss_clip": 1.04260707, "balance_loss_mlp": 1.02133381, "epoch": 0.6568869115613539, "flos": 32781342896640.0, "grad_norm": 1.8994390730297561, "language_loss": 0.60381019, "learning_rate": 1.112832758929712e-06, "loss": 0.62524986, "num_input_tokens_seen": 117536800, "step": 5463, "time_per_iteration": 2.6939549446105957 }, { "auxiliary_loss_clip": 0.01151627, "auxiliary_loss_mlp": 0.010263, "balance_loss_clip": 1.04787576, "balance_loss_mlp": 1.01877809, "epoch": 0.657007154451993, "flos": 18442428220800.0, "grad_norm": 1.819818643638633, "language_loss": 0.7472226, "learning_rate": 1.11213468615039e-06, "loss": 0.76900184, "num_input_tokens_seen": 117556230, "step": 5464, "time_per_iteration": 2.528226375579834 }, { "auxiliary_loss_clip": 0.01092843, "auxiliary_loss_mlp": 0.01029503, "balance_loss_clip": 1.04146051, "balance_loss_mlp": 1.02217174, "epoch": 0.6571273973426321, "flos": 25156717902720.0, "grad_norm": 1.9163785361743049, "language_loss": 0.75357556, "learning_rate": 1.1114367480595292e-06, "loss": 0.77479899, "num_input_tokens_seen": 117577310, "step": 5465, "time_per_iteration": 2.663393974304199 }, { "auxiliary_loss_clip": 0.01094516, "auxiliary_loss_mlp": 0.01026922, "balance_loss_clip": 1.04042733, "balance_loss_mlp": 1.01851153, "epoch": 0.6572476402332712, "flos": 17529830352000.0, "grad_norm": 1.8865834511206034, "language_loss": 0.81377321, "learning_rate": 1.1107389447630086e-06, "loss": 0.83498758, "num_input_tokens_seen": 117596010, "step": 5466, "time_per_iteration": 2.600276470184326 }, { "auxiliary_loss_clip": 0.01134162, "auxiliary_loss_mlp": 0.00761249, "balance_loss_clip": 1.04283714, "balance_loss_mlp": 1.00027466, "epoch": 0.6573678831239103, "flos": 17014260487680.0, "grad_norm": 2.0144183661632904, "language_loss": 0.78312302, "learning_rate": 1.1100412763666818e-06, "loss": 0.80207717, "num_input_tokens_seen": 117611270, "step": 5467, "time_per_iteration": 3.35496187210083 }, { "auxiliary_loss_clip": 0.01145732, "auxiliary_loss_mlp": 0.01027043, "balance_loss_clip": 1.04922211, "balance_loss_mlp": 1.0198164, "epoch": 0.6574881260145494, "flos": 23910078528000.0, "grad_norm": 1.3931207845913098, "language_loss": 0.79986113, "learning_rate": 1.1093437429763865e-06, "loss": 0.82158887, "num_input_tokens_seen": 117631535, "step": 5468, "time_per_iteration": 2.574359655380249 }, { "auxiliary_loss_clip": 0.01154368, "auxiliary_loss_mlp": 0.01025516, "balance_loss_clip": 1.0480845, "balance_loss_mlp": 1.01878953, "epoch": 0.6576083689051885, "flos": 11218458504960.0, "grad_norm": 3.984499960184166, "language_loss": 0.73648316, "learning_rate": 1.1086463446979361e-06, "loss": 0.75828201, "num_input_tokens_seen": 117649885, "step": 5469, "time_per_iteration": 2.484447956085205 }, { "auxiliary_loss_clip": 0.01160195, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.0531857, "balance_loss_mlp": 1.02185035, "epoch": 0.6577286117958275, "flos": 22455553190400.0, "grad_norm": 1.8967664563748463, "language_loss": 0.77706891, "learning_rate": 1.1079490816371277e-06, "loss": 0.79896641, "num_input_tokens_seen": 117669650, "step": 5470, "time_per_iteration": 4.071303844451904 }, { "auxiliary_loss_clip": 0.01153395, "auxiliary_loss_mlp": 0.00761596, "balance_loss_clip": 1.04656589, "balance_loss_mlp": 1.0002358, "epoch": 0.6578488546864667, "flos": 21872184405120.0, "grad_norm": 2.2430748855138205, "language_loss": 0.74732447, "learning_rate": 1.1072519538997352e-06, "loss": 0.76647437, "num_input_tokens_seen": 117688790, "step": 5471, "time_per_iteration": 2.511948823928833 }, { "auxiliary_loss_clip": 0.01139892, "auxiliary_loss_mlp": 0.01025539, "balance_loss_clip": 1.04393494, "balance_loss_mlp": 1.01784444, "epoch": 0.6579690975771058, "flos": 23543753673600.0, "grad_norm": 1.6127733408443248, "language_loss": 0.82329786, "learning_rate": 1.1065549615915095e-06, "loss": 0.84495223, "num_input_tokens_seen": 117708620, "step": 5472, "time_per_iteration": 2.538252115249634 }, { "auxiliary_loss_clip": 0.01155381, "auxiliary_loss_mlp": 0.01028864, "balance_loss_clip": 1.05169392, "balance_loss_mlp": 1.02131772, "epoch": 0.6580893404677448, "flos": 32743995730560.0, "grad_norm": 2.347671575462026, "language_loss": 0.78691113, "learning_rate": 1.105858104818187e-06, "loss": 0.80875361, "num_input_tokens_seen": 117729775, "step": 5473, "time_per_iteration": 2.5723114013671875 }, { "auxiliary_loss_clip": 0.0116008, "auxiliary_loss_mlp": 0.01030076, "balance_loss_clip": 1.0501225, "balance_loss_mlp": 1.02244616, "epoch": 0.658209583358384, "flos": 15888138220800.0, "grad_norm": 3.5841497416169923, "language_loss": 0.74833161, "learning_rate": 1.105161383685478e-06, "loss": 0.77023315, "num_input_tokens_seen": 117746160, "step": 5474, "time_per_iteration": 2.4700145721435547 }, { "auxiliary_loss_clip": 0.01028577, "auxiliary_loss_mlp": 0.01003436, "balance_loss_clip": 1.01329851, "balance_loss_mlp": 1.00232184, "epoch": 0.658329826249023, "flos": 62695902447360.0, "grad_norm": 0.7354307146301086, "language_loss": 0.56343257, "learning_rate": 1.1044647982990771e-06, "loss": 0.58375275, "num_input_tokens_seen": 117808045, "step": 5475, "time_per_iteration": 3.1064419746398926 }, { "auxiliary_loss_clip": 0.01144448, "auxiliary_loss_mlp": 0.01029261, "balance_loss_clip": 1.05015469, "balance_loss_mlp": 1.02149796, "epoch": 0.6584500691396621, "flos": 31722624501120.0, "grad_norm": 2.2579426366155144, "language_loss": 0.64310187, "learning_rate": 1.1037683487646536e-06, "loss": 0.66483897, "num_input_tokens_seen": 117828330, "step": 5476, "time_per_iteration": 2.6103577613830566 }, { "auxiliary_loss_clip": 0.01140627, "auxiliary_loss_mlp": 0.00760739, "balance_loss_clip": 1.05014277, "balance_loss_mlp": 1.00023413, "epoch": 0.6585703120303013, "flos": 18406086635520.0, "grad_norm": 2.257420017352675, "language_loss": 0.77272469, "learning_rate": 1.1030720351878583e-06, "loss": 0.79173833, "num_input_tokens_seen": 117846450, "step": 5477, "time_per_iteration": 2.5073328018188477 }, { "auxiliary_loss_clip": 0.01043743, "auxiliary_loss_mlp": 0.01001973, "balance_loss_clip": 1.01486254, "balance_loss_mlp": 1.00071514, "epoch": 0.6586905549209403, "flos": 58309880434560.0, "grad_norm": 0.80673535201615, "language_loss": 0.57681525, "learning_rate": 1.102375857674323e-06, "loss": 0.5972724, "num_input_tokens_seen": 117908365, "step": 5478, "time_per_iteration": 3.1118485927581787 }, { "auxiliary_loss_clip": 0.01137635, "auxiliary_loss_mlp": 0.01025045, "balance_loss_clip": 1.04238844, "balance_loss_mlp": 1.01817203, "epoch": 0.6588107978115794, "flos": 22782627457920.0, "grad_norm": 2.1183017210842854, "language_loss": 0.90540743, "learning_rate": 1.1016798163296561e-06, "loss": 0.9270342, "num_input_tokens_seen": 117927565, "step": 5479, "time_per_iteration": 2.5368309020996094 }, { "auxiliary_loss_clip": 0.01154024, "auxiliary_loss_mlp": 0.01033565, "balance_loss_clip": 1.04910254, "balance_loss_mlp": 1.02613211, "epoch": 0.6589310407022185, "flos": 20667525050880.0, "grad_norm": 1.9692509847570074, "language_loss": 0.66329944, "learning_rate": 1.1009839112594471e-06, "loss": 0.6851753, "num_input_tokens_seen": 117945590, "step": 5480, "time_per_iteration": 2.486555576324463 }, { "auxiliary_loss_clip": 0.01157369, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.04892313, "balance_loss_mlp": 1.02274275, "epoch": 0.6590512835928576, "flos": 25630595055360.0, "grad_norm": 2.3429498123510197, "language_loss": 0.71643865, "learning_rate": 1.1002881425692638e-06, "loss": 0.73830879, "num_input_tokens_seen": 117966020, "step": 5481, "time_per_iteration": 2.529355049133301 }, { "auxiliary_loss_clip": 0.01147378, "auxiliary_loss_mlp": 0.0102554, "balance_loss_clip": 1.04474592, "balance_loss_mlp": 1.01773763, "epoch": 0.6591715264834966, "flos": 23726108044800.0, "grad_norm": 1.6230101669003636, "language_loss": 0.75278509, "learning_rate": 1.0995925103646532e-06, "loss": 0.77451432, "num_input_tokens_seen": 117984620, "step": 5482, "time_per_iteration": 2.515502691268921 }, { "auxiliary_loss_clip": 0.01124454, "auxiliary_loss_mlp": 0.01024281, "balance_loss_clip": 1.04895043, "balance_loss_mlp": 1.01713181, "epoch": 0.6592917693741358, "flos": 35773850822400.0, "grad_norm": 1.4859234464756599, "language_loss": 0.6662966, "learning_rate": 1.0988970147511437e-06, "loss": 0.68778396, "num_input_tokens_seen": 118006500, "step": 5483, "time_per_iteration": 2.6869187355041504 }, { "auxiliary_loss_clip": 0.01140094, "auxiliary_loss_mlp": 0.01025743, "balance_loss_clip": 1.04891944, "balance_loss_mlp": 1.01834583, "epoch": 0.6594120122647749, "flos": 21396834794880.0, "grad_norm": 2.1569933747270658, "language_loss": 0.80189937, "learning_rate": 1.0982016558342405e-06, "loss": 0.82355773, "num_input_tokens_seen": 118025470, "step": 5484, "time_per_iteration": 2.5434372425079346 }, { "auxiliary_loss_clip": 0.01170963, "auxiliary_loss_mlp": 0.01025558, "balance_loss_clip": 1.05180871, "balance_loss_mlp": 1.01866436, "epoch": 0.6595322551554139, "flos": 19351829779200.0, "grad_norm": 2.0389595193234387, "language_loss": 0.71062863, "learning_rate": 1.0975064337194291e-06, "loss": 0.73259377, "num_input_tokens_seen": 118043515, "step": 5485, "time_per_iteration": 3.2565596103668213 }, { "auxiliary_loss_clip": 0.01117643, "auxiliary_loss_mlp": 0.01024576, "balance_loss_clip": 1.04235244, "balance_loss_mlp": 1.01713133, "epoch": 0.6596524980460531, "flos": 16837113588480.0, "grad_norm": 1.3851498068770394, "language_loss": 0.70296061, "learning_rate": 1.0968113485121743e-06, "loss": 0.72438276, "num_input_tokens_seen": 118063105, "step": 5486, "time_per_iteration": 2.5696792602539062 }, { "auxiliary_loss_clip": 0.01153482, "auxiliary_loss_mlp": 0.00761081, "balance_loss_clip": 1.04522395, "balance_loss_mlp": 1.00023901, "epoch": 0.6597727409366921, "flos": 21798567480960.0, "grad_norm": 1.7493107588810082, "language_loss": 0.79910505, "learning_rate": 1.0961164003179185e-06, "loss": 0.81825078, "num_input_tokens_seen": 118081615, "step": 5487, "time_per_iteration": 2.5159542560577393 }, { "auxiliary_loss_clip": 0.01123016, "auxiliary_loss_mlp": 0.01024193, "balance_loss_clip": 1.04378653, "balance_loss_mlp": 1.01719499, "epoch": 0.6598929838273312, "flos": 23730704985600.0, "grad_norm": 1.7849862634196725, "language_loss": 0.83800447, "learning_rate": 1.0954215892420884e-06, "loss": 0.85947657, "num_input_tokens_seen": 118102315, "step": 5488, "time_per_iteration": 2.572155237197876 }, { "auxiliary_loss_clip": 0.01131713, "auxiliary_loss_mlp": 0.01030527, "balance_loss_clip": 1.04833353, "balance_loss_mlp": 1.02312386, "epoch": 0.6600132267179702, "flos": 19974520978560.0, "grad_norm": 1.6634703109275866, "language_loss": 0.70291805, "learning_rate": 1.094726915390082e-06, "loss": 0.72454047, "num_input_tokens_seen": 118120650, "step": 5489, "time_per_iteration": 2.5478005409240723 }, { "auxiliary_loss_clip": 0.01154352, "auxiliary_loss_mlp": 0.01030077, "balance_loss_clip": 1.0488323, "balance_loss_mlp": 1.02267766, "epoch": 0.6601334696086094, "flos": 22342649765760.0, "grad_norm": 1.7971623772562064, "language_loss": 0.69608319, "learning_rate": 1.0940323788672836e-06, "loss": 0.71792746, "num_input_tokens_seen": 118139825, "step": 5490, "time_per_iteration": 2.520022392272949 }, { "auxiliary_loss_clip": 0.01148238, "auxiliary_loss_mlp": 0.0102621, "balance_loss_clip": 1.04734778, "balance_loss_mlp": 1.01920962, "epoch": 0.6602537124992485, "flos": 25703098657920.0, "grad_norm": 2.165453340651022, "language_loss": 0.73541296, "learning_rate": 1.0933379797790522e-06, "loss": 0.75715744, "num_input_tokens_seen": 118159240, "step": 5491, "time_per_iteration": 2.514840841293335 }, { "auxiliary_loss_clip": 0.01168169, "auxiliary_loss_mlp": 0.01029778, "balance_loss_clip": 1.0505718, "balance_loss_mlp": 1.02244675, "epoch": 0.6603739553898875, "flos": 25848572739840.0, "grad_norm": 3.1648905207737466, "language_loss": 0.71749568, "learning_rate": 1.0926437182307293e-06, "loss": 0.73947525, "num_input_tokens_seen": 118178050, "step": 5492, "time_per_iteration": 3.265697479248047 }, { "auxiliary_loss_clip": 0.01145126, "auxiliary_loss_mlp": 0.01026441, "balance_loss_clip": 1.04649067, "balance_loss_mlp": 1.01954174, "epoch": 0.6604941982805267, "flos": 24570296461440.0, "grad_norm": 1.7439931305005685, "language_loss": 0.78564936, "learning_rate": 1.0919495943276338e-06, "loss": 0.807365, "num_input_tokens_seen": 118199070, "step": 5493, "time_per_iteration": 2.5579118728637695 }, { "auxiliary_loss_clip": 0.01125637, "auxiliary_loss_mlp": 0.01026317, "balance_loss_clip": 1.04013062, "balance_loss_mlp": 1.01882493, "epoch": 0.6606144411711657, "flos": 13261775581440.0, "grad_norm": 2.2680750302643675, "language_loss": 0.76303947, "learning_rate": 1.0912556081750611e-06, "loss": 0.78455907, "num_input_tokens_seen": 118217000, "step": 5494, "time_per_iteration": 2.514146327972412 }, { "auxiliary_loss_clip": 0.01138998, "auxiliary_loss_mlp": 0.01024587, "balance_loss_clip": 1.04865861, "balance_loss_mlp": 1.01760173, "epoch": 0.6607346840618048, "flos": 25155281358720.0, "grad_norm": 2.2928014241872643, "language_loss": 0.76386249, "learning_rate": 1.0905617598782909e-06, "loss": 0.78549838, "num_input_tokens_seen": 118237205, "step": 5495, "time_per_iteration": 2.547835111618042 }, { "auxiliary_loss_clip": 0.01103215, "auxiliary_loss_mlp": 0.01026684, "balance_loss_clip": 1.04092574, "balance_loss_mlp": 1.01934385, "epoch": 0.660854926952444, "flos": 17638029095040.0, "grad_norm": 2.158673180055542, "language_loss": 0.81100333, "learning_rate": 1.0898680495425775e-06, "loss": 0.83230233, "num_input_tokens_seen": 118255495, "step": 5496, "time_per_iteration": 3.3456761837005615 }, { "auxiliary_loss_clip": 0.01141316, "auxiliary_loss_mlp": 0.01029356, "balance_loss_clip": 1.04704785, "balance_loss_mlp": 1.02192342, "epoch": 0.660975169843083, "flos": 16836000266880.0, "grad_norm": 1.6444940842124636, "language_loss": 0.80337656, "learning_rate": 1.0891744772731594e-06, "loss": 0.82508326, "num_input_tokens_seen": 118273310, "step": 5497, "time_per_iteration": 2.494062900543213 }, { "auxiliary_loss_clip": 0.01154229, "auxiliary_loss_mlp": 0.01029667, "balance_loss_clip": 1.04695964, "balance_loss_mlp": 1.02226448, "epoch": 0.6610954127337221, "flos": 26870410846080.0, "grad_norm": 2.3638076316297494, "language_loss": 0.65805721, "learning_rate": 1.088481043175248e-06, "loss": 0.67989612, "num_input_tokens_seen": 118293880, "step": 5498, "time_per_iteration": 2.5377020835876465 }, { "auxiliary_loss_clip": 0.01130318, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.04349136, "balance_loss_mlp": 1.02036619, "epoch": 0.6612156556243612, "flos": 26465697331200.0, "grad_norm": 1.668976575413938, "language_loss": 0.75558031, "learning_rate": 1.0877877473540368e-06, "loss": 0.77716005, "num_input_tokens_seen": 118314465, "step": 5499, "time_per_iteration": 2.578437328338623 }, { "auxiliary_loss_clip": 0.01168852, "auxiliary_loss_mlp": 0.01030795, "balance_loss_clip": 1.04905868, "balance_loss_mlp": 1.0236659, "epoch": 0.6613358985150003, "flos": 19791915212160.0, "grad_norm": 1.807928516946348, "language_loss": 0.72489107, "learning_rate": 1.0870945899147002e-06, "loss": 0.74688751, "num_input_tokens_seen": 118331110, "step": 5500, "time_per_iteration": 2.4338152408599854 }, { "auxiliary_loss_clip": 0.01153463, "auxiliary_loss_mlp": 0.01029353, "balance_loss_clip": 1.04943109, "balance_loss_mlp": 1.02202439, "epoch": 0.6614561414056394, "flos": 26831627136000.0, "grad_norm": 1.939863583808289, "language_loss": 0.76043272, "learning_rate": 1.0864015709623879e-06, "loss": 0.78226089, "num_input_tokens_seen": 118351980, "step": 5501, "time_per_iteration": 2.5715439319610596 }, { "auxiliary_loss_clip": 0.0116034, "auxiliary_loss_mlp": 0.0103032, "balance_loss_clip": 1.05044353, "balance_loss_mlp": 1.02279162, "epoch": 0.6615763842962785, "flos": 22894597128960.0, "grad_norm": 2.1650330374480795, "language_loss": 0.79818875, "learning_rate": 1.0857086906022313e-06, "loss": 0.8200953, "num_input_tokens_seen": 118370315, "step": 5502, "time_per_iteration": 2.552996873855591 }, { "auxiliary_loss_clip": 0.01087141, "auxiliary_loss_mlp": 0.01026437, "balance_loss_clip": 1.04243755, "balance_loss_mlp": 1.01879859, "epoch": 0.6616966271869176, "flos": 24790321221120.0, "grad_norm": 2.0613688894625057, "language_loss": 0.73479098, "learning_rate": 1.0850159489393388e-06, "loss": 0.75592679, "num_input_tokens_seen": 118389575, "step": 5503, "time_per_iteration": 2.6428065299987793 }, { "auxiliary_loss_clip": 0.01118949, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.04063892, "balance_loss_mlp": 1.01957202, "epoch": 0.6618168700775566, "flos": 17202109639680.0, "grad_norm": 1.7808163314410936, "language_loss": 0.8219468, "learning_rate": 1.0843233460787992e-06, "loss": 0.84341192, "num_input_tokens_seen": 118406790, "step": 5504, "time_per_iteration": 2.52597975730896 }, { "auxiliary_loss_clip": 0.01112492, "auxiliary_loss_mlp": 0.01029787, "balance_loss_clip": 1.04332244, "balance_loss_mlp": 1.022488, "epoch": 0.6619371129681958, "flos": 25447091448960.0, "grad_norm": 1.933018497589596, "language_loss": 0.77505088, "learning_rate": 1.0836308821256805e-06, "loss": 0.79647362, "num_input_tokens_seen": 118427590, "step": 5505, "time_per_iteration": 2.6026968955993652 }, { "auxiliary_loss_clip": 0.01154463, "auxiliary_loss_mlp": 0.01024677, "balance_loss_clip": 1.04975986, "balance_loss_mlp": 1.01818943, "epoch": 0.6620573558588349, "flos": 18040444139520.0, "grad_norm": 2.032497809118727, "language_loss": 0.77861512, "learning_rate": 1.0829385571850282e-06, "loss": 0.80040658, "num_input_tokens_seen": 118444570, "step": 5506, "time_per_iteration": 2.4761977195739746 }, { "auxiliary_loss_clip": 0.01174588, "auxiliary_loss_mlp": 0.01031584, "balance_loss_clip": 1.05231977, "balance_loss_mlp": 1.02336502, "epoch": 0.6621775987494739, "flos": 17785586165760.0, "grad_norm": 2.314740657880902, "language_loss": 0.8353042, "learning_rate": 1.0822463713618679e-06, "loss": 0.85736597, "num_input_tokens_seen": 118461425, "step": 5507, "time_per_iteration": 2.4326953887939453 }, { "auxiliary_loss_clip": 0.0112922, "auxiliary_loss_mlp": 0.01030417, "balance_loss_clip": 1.04468369, "balance_loss_mlp": 1.02280498, "epoch": 0.6622978416401131, "flos": 17492590926720.0, "grad_norm": 2.049302002158593, "language_loss": 0.84745061, "learning_rate": 1.0815543247612034e-06, "loss": 0.86904699, "num_input_tokens_seen": 118478495, "step": 5508, "time_per_iteration": 2.528991222381592 }, { "auxiliary_loss_clip": 0.01137456, "auxiliary_loss_mlp": 0.01025628, "balance_loss_clip": 1.04203105, "balance_loss_mlp": 1.01841617, "epoch": 0.6624180845307521, "flos": 21648352803840.0, "grad_norm": 1.5296939165763543, "language_loss": 0.82940531, "learning_rate": 1.0808624174880168e-06, "loss": 0.85103613, "num_input_tokens_seen": 118499145, "step": 5509, "time_per_iteration": 2.537257432937622 }, { "auxiliary_loss_clip": 0.01164657, "auxiliary_loss_mlp": 0.01023131, "balance_loss_clip": 1.04854012, "balance_loss_mlp": 1.01627612, "epoch": 0.6625383274213912, "flos": 23805902108160.0, "grad_norm": 1.8607091536186615, "language_loss": 0.79889202, "learning_rate": 1.080170649647272e-06, "loss": 0.82076991, "num_input_tokens_seen": 118518950, "step": 5510, "time_per_iteration": 2.500964879989624 }, { "auxiliary_loss_clip": 0.01165253, "auxiliary_loss_mlp": 0.01029622, "balance_loss_clip": 1.04846716, "balance_loss_mlp": 1.02227271, "epoch": 0.6626585703120303, "flos": 33262941473280.0, "grad_norm": 1.6104067821836114, "language_loss": 0.67571723, "learning_rate": 1.0794790213439068e-06, "loss": 0.69766599, "num_input_tokens_seen": 118545850, "step": 5511, "time_per_iteration": 3.410719871520996 }, { "auxiliary_loss_clip": 0.01114764, "auxiliary_loss_mlp": 0.01028024, "balance_loss_clip": 1.04362428, "balance_loss_mlp": 1.02057886, "epoch": 0.6627788132026694, "flos": 22085780630400.0, "grad_norm": 2.7432983403994378, "language_loss": 0.78772312, "learning_rate": 1.078787532682843e-06, "loss": 0.80915105, "num_input_tokens_seen": 118563325, "step": 5512, "time_per_iteration": 2.5909414291381836 }, { "auxiliary_loss_clip": 0.01151746, "auxiliary_loss_mlp": 0.01024193, "balance_loss_clip": 1.04846525, "balance_loss_mlp": 1.01706421, "epoch": 0.6628990560933085, "flos": 36173608260480.0, "grad_norm": 2.1596213229518515, "language_loss": 0.75843787, "learning_rate": 1.0780961837689773e-06, "loss": 0.78019726, "num_input_tokens_seen": 118582835, "step": 5513, "time_per_iteration": 2.60988450050354 }, { "auxiliary_loss_clip": 0.01133478, "auxiliary_loss_mlp": 0.0102851, "balance_loss_clip": 1.04829741, "balance_loss_mlp": 1.02123809, "epoch": 0.6630192989839476, "flos": 18513567106560.0, "grad_norm": 1.8643411618227863, "language_loss": 0.69985747, "learning_rate": 1.0774049747071883e-06, "loss": 0.72147733, "num_input_tokens_seen": 118600715, "step": 5514, "time_per_iteration": 2.50639271736145 }, { "auxiliary_loss_clip": 0.01113779, "auxiliary_loss_mlp": 0.01021988, "balance_loss_clip": 1.04723322, "balance_loss_mlp": 1.01464462, "epoch": 0.6631395418745867, "flos": 35809510049280.0, "grad_norm": 4.165876233833878, "language_loss": 0.68201339, "learning_rate": 1.076713905602332e-06, "loss": 0.70337105, "num_input_tokens_seen": 118621290, "step": 5515, "time_per_iteration": 2.705028533935547 }, { "auxiliary_loss_clip": 0.01160533, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.05137312, "balance_loss_mlp": 1.02443373, "epoch": 0.6632597847652257, "flos": 20047742853120.0, "grad_norm": 1.767234490246254, "language_loss": 0.81218648, "learning_rate": 1.07602297655924e-06, "loss": 0.83411199, "num_input_tokens_seen": 118639610, "step": 5516, "time_per_iteration": 2.484509229660034 }, { "auxiliary_loss_clip": 0.01169995, "auxiliary_loss_mlp": 0.01023097, "balance_loss_clip": 1.05278468, "balance_loss_mlp": 1.01617074, "epoch": 0.6633800276558649, "flos": 21214480423680.0, "grad_norm": 1.6742899219487943, "language_loss": 0.8105191, "learning_rate": 1.0753321876827292e-06, "loss": 0.83244991, "num_input_tokens_seen": 118658895, "step": 5517, "time_per_iteration": 2.465320348739624 }, { "auxiliary_loss_clip": 0.0116881, "auxiliary_loss_mlp": 0.01028755, "balance_loss_clip": 1.04975533, "balance_loss_mlp": 1.02153695, "epoch": 0.663500270546504, "flos": 23987753688960.0, "grad_norm": 3.2276838761070024, "language_loss": 0.73799723, "learning_rate": 1.0746415390775893e-06, "loss": 0.75997287, "num_input_tokens_seen": 118677025, "step": 5518, "time_per_iteration": 3.277190923690796 }, { "auxiliary_loss_clip": 0.01167921, "auxiliary_loss_mlp": 0.01031745, "balance_loss_clip": 1.05128968, "balance_loss_mlp": 1.02448535, "epoch": 0.663620513437143, "flos": 17932389050880.0, "grad_norm": 2.4462773832948197, "language_loss": 0.7630775, "learning_rate": 1.0739510308485939e-06, "loss": 0.78507423, "num_input_tokens_seen": 118694240, "step": 5519, "time_per_iteration": 2.4362287521362305 }, { "auxiliary_loss_clip": 0.01031524, "auxiliary_loss_mlp": 0.01002315, "balance_loss_clip": 1.01451707, "balance_loss_mlp": 1.00111103, "epoch": 0.6637407563277821, "flos": 57840241086720.0, "grad_norm": 0.8112241909036088, "language_loss": 0.62560725, "learning_rate": 1.07326066310049e-06, "loss": 0.64594567, "num_input_tokens_seen": 118758365, "step": 5520, "time_per_iteration": 3.171503782272339 }, { "auxiliary_loss_clip": 0.0112347, "auxiliary_loss_mlp": 0.01030896, "balance_loss_clip": 1.04358244, "balance_loss_mlp": 1.02272391, "epoch": 0.6638609992184212, "flos": 27306007079040.0, "grad_norm": 1.78182835288331, "language_loss": 0.79073703, "learning_rate": 1.0725704359380059e-06, "loss": 0.81228065, "num_input_tokens_seen": 118778220, "step": 5521, "time_per_iteration": 2.6341795921325684 }, { "auxiliary_loss_clip": 0.01168623, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.05006671, "balance_loss_mlp": 1.02466583, "epoch": 0.6639812421090603, "flos": 18624854419200.0, "grad_norm": 1.8415030156218595, "language_loss": 0.7208572, "learning_rate": 1.0718803494658497e-06, "loss": 0.74286306, "num_input_tokens_seen": 118797110, "step": 5522, "time_per_iteration": 3.151150941848755 }, { "auxiliary_loss_clip": 0.01059058, "auxiliary_loss_mlp": 0.01030775, "balance_loss_clip": 1.0361861, "balance_loss_mlp": 1.02331877, "epoch": 0.6641014849996993, "flos": 15924479806080.0, "grad_norm": 2.0456372258305198, "language_loss": 0.84107578, "learning_rate": 1.071190403788707e-06, "loss": 0.86197412, "num_input_tokens_seen": 118812415, "step": 5523, "time_per_iteration": 3.5614545345306396 }, { "auxiliary_loss_clip": 0.01134931, "auxiliary_loss_mlp": 0.01027932, "balance_loss_clip": 1.04885173, "balance_loss_mlp": 1.02079189, "epoch": 0.6642217278903385, "flos": 26505486622080.0, "grad_norm": 1.962634767268292, "language_loss": 0.75613731, "learning_rate": 1.0705005990112415e-06, "loss": 0.77776593, "num_input_tokens_seen": 118832195, "step": 5524, "time_per_iteration": 2.8410847187042236 }, { "auxiliary_loss_clip": 0.01101889, "auxiliary_loss_mlp": 0.01025927, "balance_loss_clip": 1.04534936, "balance_loss_mlp": 1.01834559, "epoch": 0.6643419707809776, "flos": 15377308951680.0, "grad_norm": 4.6554250175982785, "language_loss": 0.74557507, "learning_rate": 1.0698109352380957e-06, "loss": 0.76685327, "num_input_tokens_seen": 118849795, "step": 5525, "time_per_iteration": 2.5994455814361572 }, { "auxiliary_loss_clip": 0.01165847, "auxiliary_loss_mlp": 0.01026272, "balance_loss_clip": 1.04871643, "balance_loss_mlp": 1.01907766, "epoch": 0.6644622136716166, "flos": 25117610970240.0, "grad_norm": 1.7386773514304639, "language_loss": 0.77790105, "learning_rate": 1.0691214125738909e-06, "loss": 0.79982221, "num_input_tokens_seen": 118870000, "step": 5526, "time_per_iteration": 2.5054798126220703 }, { "auxiliary_loss_clip": 0.01061387, "auxiliary_loss_mlp": 0.01002138, "balance_loss_clip": 1.013659, "balance_loss_mlp": 1.00094032, "epoch": 0.6645824565622558, "flos": 66201717680640.0, "grad_norm": 0.791641891176091, "language_loss": 0.57523292, "learning_rate": 1.0684320311232287e-06, "loss": 0.59586817, "num_input_tokens_seen": 118932905, "step": 5527, "time_per_iteration": 3.2169415950775146 }, { "auxiliary_loss_clip": 0.01135224, "auxiliary_loss_mlp": 0.01024632, "balance_loss_clip": 1.0454042, "balance_loss_mlp": 1.01684177, "epoch": 0.6647026994528948, "flos": 25082131311360.0, "grad_norm": 1.6500292091141693, "language_loss": 0.81289381, "learning_rate": 1.0677427909906865e-06, "loss": 0.83449233, "num_input_tokens_seen": 118953355, "step": 5528, "time_per_iteration": 2.6502134799957275 }, { "auxiliary_loss_clip": 0.01172388, "auxiliary_loss_mlp": 0.01030413, "balance_loss_clip": 1.0509243, "balance_loss_mlp": 1.02305245, "epoch": 0.6648229423435339, "flos": 18222187979520.0, "grad_norm": 1.928909200883684, "language_loss": 0.72334909, "learning_rate": 1.0670536922808216e-06, "loss": 0.74537706, "num_input_tokens_seen": 118973480, "step": 5529, "time_per_iteration": 2.565969228744507 }, { "auxiliary_loss_clip": 0.01142592, "auxiliary_loss_mlp": 0.01028578, "balance_loss_clip": 1.04839218, "balance_loss_mlp": 1.02149677, "epoch": 0.6649431852341731, "flos": 18296882311680.0, "grad_norm": 2.0330468833455746, "language_loss": 0.71671939, "learning_rate": 1.06636473509817e-06, "loss": 0.73843116, "num_input_tokens_seen": 118989860, "step": 5530, "time_per_iteration": 2.5809764862060547 }, { "auxiliary_loss_clip": 0.01137972, "auxiliary_loss_mlp": 0.00761254, "balance_loss_clip": 1.04583156, "balance_loss_mlp": 1.00023961, "epoch": 0.6650634281248121, "flos": 17019575700480.0, "grad_norm": 2.117131494032988, "language_loss": 0.80721557, "learning_rate": 1.0656759195472447e-06, "loss": 0.82620788, "num_input_tokens_seen": 119007150, "step": 5531, "time_per_iteration": 2.511514902114868 }, { "auxiliary_loss_clip": 0.01040683, "auxiliary_loss_mlp": 0.01002618, "balance_loss_clip": 1.01448774, "balance_loss_mlp": 1.00149715, "epoch": 0.6651836710154512, "flos": 69294810666240.0, "grad_norm": 0.8015071945037564, "language_loss": 0.59731883, "learning_rate": 1.0649872457325414e-06, "loss": 0.61775184, "num_input_tokens_seen": 119068435, "step": 5532, "time_per_iteration": 3.075063705444336 }, { "auxiliary_loss_clip": 0.01052046, "auxiliary_loss_mlp": 0.01001531, "balance_loss_clip": 1.01303077, "balance_loss_mlp": 1.00029755, "epoch": 0.6653039139060903, "flos": 66883444882560.0, "grad_norm": 0.8483726450148077, "language_loss": 0.55121338, "learning_rate": 1.0642987137585278e-06, "loss": 0.57174921, "num_input_tokens_seen": 119127960, "step": 5533, "time_per_iteration": 3.0424134731292725 }, { "auxiliary_loss_clip": 0.01137593, "auxiliary_loss_mlp": 0.0102605, "balance_loss_clip": 1.04639876, "balance_loss_mlp": 1.01865935, "epoch": 0.6654241567967294, "flos": 21470056669440.0, "grad_norm": 1.6894615229284786, "language_loss": 0.82278341, "learning_rate": 1.0636103237296561e-06, "loss": 0.84441984, "num_input_tokens_seen": 119146885, "step": 5534, "time_per_iteration": 2.5111546516418457 }, { "auxiliary_loss_clip": 0.01154523, "auxiliary_loss_mlp": 0.01028138, "balance_loss_clip": 1.05258727, "balance_loss_mlp": 1.02159357, "epoch": 0.6655443996873684, "flos": 25119514391040.0, "grad_norm": 1.789725700514809, "language_loss": 0.84607613, "learning_rate": 1.062922075750353e-06, "loss": 0.86790276, "num_input_tokens_seen": 119166900, "step": 5535, "time_per_iteration": 2.519035816192627 }, { "auxiliary_loss_clip": 0.01128742, "auxiliary_loss_mlp": 0.01023192, "balance_loss_clip": 1.04626155, "balance_loss_mlp": 1.01642966, "epoch": 0.6656646425780076, "flos": 17457326749440.0, "grad_norm": 1.9207645013029482, "language_loss": 0.72056508, "learning_rate": 1.0622339699250267e-06, "loss": 0.74208444, "num_input_tokens_seen": 119184820, "step": 5536, "time_per_iteration": 3.3568038940429688 }, { "auxiliary_loss_clip": 0.01125732, "auxiliary_loss_mlp": 0.01027251, "balance_loss_clip": 1.04463577, "balance_loss_mlp": 1.02078438, "epoch": 0.6657848854686467, "flos": 23434190213760.0, "grad_norm": 2.3385865048339887, "language_loss": 0.79227579, "learning_rate": 1.0615460063580624e-06, "loss": 0.81380564, "num_input_tokens_seen": 119203295, "step": 5537, "time_per_iteration": 2.5803029537200928 }, { "auxiliary_loss_clip": 0.01143539, "auxiliary_loss_mlp": 0.01027818, "balance_loss_clip": 1.0485642, "balance_loss_mlp": 1.0210793, "epoch": 0.6659051283592857, "flos": 11509909459200.0, "grad_norm": 1.761456125878147, "language_loss": 0.72898674, "learning_rate": 1.060858185153821e-06, "loss": 0.75070035, "num_input_tokens_seen": 119221395, "step": 5538, "time_per_iteration": 2.500131607055664 }, { "auxiliary_loss_clip": 0.01144439, "auxiliary_loss_mlp": 0.01031035, "balance_loss_clip": 1.04721081, "balance_loss_mlp": 1.02286375, "epoch": 0.6660253712499249, "flos": 20594554571520.0, "grad_norm": 2.6321393073114376, "language_loss": 0.75560683, "learning_rate": 1.0601705064166474e-06, "loss": 0.77736157, "num_input_tokens_seen": 119239790, "step": 5539, "time_per_iteration": 2.520533800125122 }, { "auxiliary_loss_clip": 0.01133567, "auxiliary_loss_mlp": 0.01029114, "balance_loss_clip": 1.04562283, "balance_loss_mlp": 1.02150583, "epoch": 0.666145614140564, "flos": 21251504367360.0, "grad_norm": 1.9766448351208366, "language_loss": 0.73434651, "learning_rate": 1.0594829702508596e-06, "loss": 0.75597334, "num_input_tokens_seen": 119257505, "step": 5540, "time_per_iteration": 2.5094213485717773 }, { "auxiliary_loss_clip": 0.01129497, "auxiliary_loss_mlp": 0.0102637, "balance_loss_clip": 1.04587245, "balance_loss_mlp": 1.01968837, "epoch": 0.666265857031203, "flos": 33726188200320.0, "grad_norm": 1.6295548485481133, "language_loss": 0.55307102, "learning_rate": 1.0587955767607592e-06, "loss": 0.57462966, "num_input_tokens_seen": 119279365, "step": 5541, "time_per_iteration": 2.672281265258789 }, { "auxiliary_loss_clip": 0.01167411, "auxiliary_loss_mlp": 0.01023456, "balance_loss_clip": 1.04973793, "balance_loss_mlp": 1.01610649, "epoch": 0.6663860999218422, "flos": 17456644391040.0, "grad_norm": 2.3371178143932294, "language_loss": 0.77486169, "learning_rate": 1.0581083260506206e-06, "loss": 0.79677033, "num_input_tokens_seen": 119296150, "step": 5542, "time_per_iteration": 2.425196409225464 }, { "auxiliary_loss_clip": 0.01137748, "auxiliary_loss_mlp": 0.01025085, "balance_loss_clip": 1.04424655, "balance_loss_mlp": 1.01790833, "epoch": 0.6665063428124812, "flos": 17676740977920.0, "grad_norm": 2.1096521898504177, "language_loss": 0.76581764, "learning_rate": 1.0574212182246993e-06, "loss": 0.78744602, "num_input_tokens_seen": 119314845, "step": 5543, "time_per_iteration": 2.533027410507202 }, { "auxiliary_loss_clip": 0.01145583, "auxiliary_loss_mlp": 0.01027514, "balance_loss_clip": 1.04761624, "balance_loss_mlp": 1.0198642, "epoch": 0.6666265857031203, "flos": 27673265687040.0, "grad_norm": 2.490136574284313, "language_loss": 0.75864154, "learning_rate": 1.0567342533872303e-06, "loss": 0.7803725, "num_input_tokens_seen": 119334875, "step": 5544, "time_per_iteration": 2.5899250507354736 }, { "auxiliary_loss_clip": 0.01142512, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 1.04852867, "balance_loss_mlp": 1.02363825, "epoch": 0.6667468285937594, "flos": 25046831220480.0, "grad_norm": 1.603866791244231, "language_loss": 0.80763471, "learning_rate": 1.0560474316424255e-06, "loss": 0.82937217, "num_input_tokens_seen": 119354635, "step": 5545, "time_per_iteration": 3.3497142791748047 }, { "auxiliary_loss_clip": 0.01140372, "auxiliary_loss_mlp": 0.01025842, "balance_loss_clip": 1.04483199, "balance_loss_mlp": 1.01828444, "epoch": 0.6668670714843985, "flos": 22780472641920.0, "grad_norm": 2.1078059633374666, "language_loss": 0.73561567, "learning_rate": 1.0553607530944746e-06, "loss": 0.75727785, "num_input_tokens_seen": 119372690, "step": 5546, "time_per_iteration": 2.5736465454101562 }, { "auxiliary_loss_clip": 0.01126833, "auxiliary_loss_mlp": 0.01026327, "balance_loss_clip": 1.04361987, "balance_loss_mlp": 1.01900721, "epoch": 0.6669873143750376, "flos": 22163886754560.0, "grad_norm": 2.011932618377912, "language_loss": 0.89484924, "learning_rate": 1.0546742178475463e-06, "loss": 0.91638088, "num_input_tokens_seen": 119391685, "step": 5547, "time_per_iteration": 2.5795228481292725 }, { "auxiliary_loss_clip": 0.01116287, "auxiliary_loss_mlp": 0.01024101, "balance_loss_clip": 1.04724228, "balance_loss_mlp": 1.01746094, "epoch": 0.6671075572656767, "flos": 20514832335360.0, "grad_norm": 1.817005994615816, "language_loss": 0.86707288, "learning_rate": 1.0539878260057868e-06, "loss": 0.88847685, "num_input_tokens_seen": 119410725, "step": 5548, "time_per_iteration": 4.004964590072632 }, { "auxiliary_loss_clip": 0.01155837, "auxiliary_loss_mlp": 0.01031563, "balance_loss_clip": 1.05071735, "balance_loss_mlp": 1.02368307, "epoch": 0.6672278001563158, "flos": 17931203902080.0, "grad_norm": 2.403607840916262, "language_loss": 0.68259037, "learning_rate": 1.0533015776733226e-06, "loss": 0.70446438, "num_input_tokens_seen": 119426875, "step": 5549, "time_per_iteration": 2.450758695602417 }, { "auxiliary_loss_clip": 0.01136191, "auxiliary_loss_mlp": 0.01030155, "balance_loss_clip": 1.04714096, "balance_loss_mlp": 1.02224565, "epoch": 0.6673480430469548, "flos": 22342146975360.0, "grad_norm": 2.202187835943687, "language_loss": 0.78510875, "learning_rate": 1.0526154729542566e-06, "loss": 0.80677223, "num_input_tokens_seen": 119446935, "step": 5550, "time_per_iteration": 2.5340051651000977 }, { "auxiliary_loss_clip": 0.01128042, "auxiliary_loss_mlp": 0.01023529, "balance_loss_clip": 1.0485692, "balance_loss_mlp": 1.01639771, "epoch": 0.6674682859375939, "flos": 20703830722560.0, "grad_norm": 2.365660936632226, "language_loss": 0.80379283, "learning_rate": 1.0519295119526699e-06, "loss": 0.82530856, "num_input_tokens_seen": 119463240, "step": 5551, "time_per_iteration": 2.56020450592041 }, { "auxiliary_loss_clip": 0.01143908, "auxiliary_loss_mlp": 0.01023808, "balance_loss_clip": 1.04758704, "balance_loss_mlp": 1.01656914, "epoch": 0.667588528828233, "flos": 26206673379840.0, "grad_norm": 1.5905477189425241, "language_loss": 0.82656413, "learning_rate": 1.0512436947726227e-06, "loss": 0.84824127, "num_input_tokens_seen": 119484655, "step": 5552, "time_per_iteration": 2.5629281997680664 }, { "auxiliary_loss_clip": 0.0112667, "auxiliary_loss_mlp": 0.01025164, "balance_loss_clip": 1.04469204, "balance_loss_mlp": 1.01786256, "epoch": 0.6677087717188721, "flos": 23071025756160.0, "grad_norm": 2.5903207221299533, "language_loss": 0.65251601, "learning_rate": 1.0505580215181517e-06, "loss": 0.67403436, "num_input_tokens_seen": 119502895, "step": 5553, "time_per_iteration": 2.5495095252990723 }, { "auxiliary_loss_clip": 0.01020605, "auxiliary_loss_mlp": 0.01004987, "balance_loss_clip": 1.01227486, "balance_loss_mlp": 1.00391376, "epoch": 0.6678290146095112, "flos": 70941315219840.0, "grad_norm": 0.7965459120548644, "language_loss": 0.56627905, "learning_rate": 1.0498724922932753e-06, "loss": 0.58653498, "num_input_tokens_seen": 119561010, "step": 5554, "time_per_iteration": 3.0812716484069824 }, { "auxiliary_loss_clip": 0.01174021, "auxiliary_loss_mlp": 0.01028143, "balance_loss_clip": 1.05307794, "balance_loss_mlp": 1.02000129, "epoch": 0.6679492575001503, "flos": 18661088263680.0, "grad_norm": 2.0352660915695853, "language_loss": 0.86421895, "learning_rate": 1.0491871072019851e-06, "loss": 0.8862406, "num_input_tokens_seen": 119578900, "step": 5555, "time_per_iteration": 2.4442546367645264 }, { "auxiliary_loss_clip": 0.01129697, "auxiliary_loss_mlp": 0.01029453, "balance_loss_clip": 1.04370177, "balance_loss_mlp": 1.02268767, "epoch": 0.6680695003907894, "flos": 29711985822720.0, "grad_norm": 1.639524166169494, "language_loss": 0.64358938, "learning_rate": 1.0485018663482555e-06, "loss": 0.66518092, "num_input_tokens_seen": 119598920, "step": 5556, "time_per_iteration": 2.6116418838500977 }, { "auxiliary_loss_clip": 0.0115045, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.04860258, "balance_loss_mlp": 1.018857, "epoch": 0.6681897432814284, "flos": 28218964083840.0, "grad_norm": 2.461462032615497, "language_loss": 0.70598614, "learning_rate": 1.0478167698360354e-06, "loss": 0.72775996, "num_input_tokens_seen": 119618220, "step": 5557, "time_per_iteration": 2.5278279781341553 }, { "auxiliary_loss_clip": 0.0114518, "auxiliary_loss_mlp": 0.01029748, "balance_loss_clip": 1.04520202, "balance_loss_mlp": 1.02213073, "epoch": 0.6683099861720676, "flos": 25046543911680.0, "grad_norm": 2.021330036682852, "language_loss": 0.70470637, "learning_rate": 1.0471318177692556e-06, "loss": 0.72645569, "num_input_tokens_seen": 119638520, "step": 5558, "time_per_iteration": 2.5291197299957275 }, { "auxiliary_loss_clip": 0.01113417, "auxiliary_loss_mlp": 0.01023587, "balance_loss_clip": 1.04172218, "balance_loss_mlp": 1.01666713, "epoch": 0.6684302290627067, "flos": 22996977868800.0, "grad_norm": 2.2224188221020924, "language_loss": 0.75946176, "learning_rate": 1.046447010251821e-06, "loss": 0.78083169, "num_input_tokens_seen": 119655850, "step": 5559, "time_per_iteration": 2.604618549346924 }, { "auxiliary_loss_clip": 0.0114089, "auxiliary_loss_mlp": 0.01029535, "balance_loss_clip": 1.04927588, "balance_loss_mlp": 1.02283847, "epoch": 0.6685504719533457, "flos": 26573824247040.0, "grad_norm": 1.635632401923903, "language_loss": 0.75770414, "learning_rate": 1.0457623473876157e-06, "loss": 0.77940834, "num_input_tokens_seen": 119675355, "step": 5560, "time_per_iteration": 2.5775182247161865 }, { "auxiliary_loss_clip": 0.01165832, "auxiliary_loss_mlp": 0.01026843, "balance_loss_clip": 1.04904246, "balance_loss_mlp": 1.01992249, "epoch": 0.6686707148439849, "flos": 28986087870720.0, "grad_norm": 1.7753152940242976, "language_loss": 0.70947427, "learning_rate": 1.0450778292805046e-06, "loss": 0.73140103, "num_input_tokens_seen": 119695340, "step": 5561, "time_per_iteration": 2.49786114692688 }, { "auxiliary_loss_clip": 0.01157808, "auxiliary_loss_mlp": 0.01033439, "balance_loss_clip": 1.04860365, "balance_loss_mlp": 1.02635765, "epoch": 0.6687909577346239, "flos": 23623152687360.0, "grad_norm": 1.5127376380851054, "language_loss": 0.78669751, "learning_rate": 1.0443934560343267e-06, "loss": 0.80860996, "num_input_tokens_seen": 119716750, "step": 5562, "time_per_iteration": 3.342372179031372 }, { "auxiliary_loss_clip": 0.0111153, "auxiliary_loss_mlp": 0.01028714, "balance_loss_clip": 1.0415771, "balance_loss_mlp": 1.02182984, "epoch": 0.668911200625263, "flos": 23148593176320.0, "grad_norm": 1.9350987321996647, "language_loss": 0.7811296, "learning_rate": 1.0437092277529034e-06, "loss": 0.80253208, "num_input_tokens_seen": 119736005, "step": 5563, "time_per_iteration": 2.5839102268218994 }, { "auxiliary_loss_clip": 0.01135455, "auxiliary_loss_mlp": 0.01032186, "balance_loss_clip": 1.04546261, "balance_loss_mlp": 1.02523923, "epoch": 0.6690314435159022, "flos": 18551919853440.0, "grad_norm": 2.274694452106577, "language_loss": 0.73593533, "learning_rate": 1.0430251445400292e-06, "loss": 0.75761175, "num_input_tokens_seen": 119754050, "step": 5564, "time_per_iteration": 2.4953529834747314 }, { "auxiliary_loss_clip": 0.01068526, "auxiliary_loss_mlp": 0.01026962, "balance_loss_clip": 1.04126334, "balance_loss_mlp": 1.02013087, "epoch": 0.6691516864065412, "flos": 31759540704000.0, "grad_norm": 2.0371121135412666, "language_loss": 0.62822533, "learning_rate": 1.0423412064994787e-06, "loss": 0.64918017, "num_input_tokens_seen": 119774820, "step": 5565, "time_per_iteration": 2.9091737270355225 }, { "auxiliary_loss_clip": 0.01126024, "auxiliary_loss_mlp": 0.01024362, "balance_loss_clip": 1.04286289, "balance_loss_mlp": 1.0178113, "epoch": 0.6692719292971803, "flos": 34933864296960.0, "grad_norm": 1.7925151324497715, "language_loss": 0.73881519, "learning_rate": 1.0416574137350064e-06, "loss": 0.76031911, "num_input_tokens_seen": 119795525, "step": 5566, "time_per_iteration": 2.8942360877990723 }, { "auxiliary_loss_clip": 0.01147847, "auxiliary_loss_mlp": 0.01029205, "balance_loss_clip": 1.04772437, "balance_loss_mlp": 1.02204311, "epoch": 0.6693921721878194, "flos": 20449188230400.0, "grad_norm": 2.326676562983499, "language_loss": 0.80631942, "learning_rate": 1.0409737663503428e-06, "loss": 0.82808995, "num_input_tokens_seen": 119813905, "step": 5567, "time_per_iteration": 2.5580177307128906 }, { "auxiliary_loss_clip": 0.01150615, "auxiliary_loss_mlp": 0.01024114, "balance_loss_clip": 1.04476333, "balance_loss_mlp": 1.0167706, "epoch": 0.6695124150784585, "flos": 16614538963200.0, "grad_norm": 1.7200582538463487, "language_loss": 0.82848376, "learning_rate": 1.040290264449196e-06, "loss": 0.85023105, "num_input_tokens_seen": 119832010, "step": 5568, "time_per_iteration": 2.5544047355651855 }, { "auxiliary_loss_clip": 0.01152235, "auxiliary_loss_mlp": 0.01022461, "balance_loss_clip": 1.05041766, "balance_loss_mlp": 1.01566601, "epoch": 0.6696326579690975, "flos": 26652145852800.0, "grad_norm": 1.808884667167318, "language_loss": 0.64315307, "learning_rate": 1.0396069081352532e-06, "loss": 0.6649, "num_input_tokens_seen": 119851165, "step": 5569, "time_per_iteration": 2.6171913146972656 }, { "auxiliary_loss_clip": 0.01059124, "auxiliary_loss_mlp": 0.01001158, "balance_loss_clip": 1.01173174, "balance_loss_mlp": 0.9999842, "epoch": 0.6697529008597367, "flos": 66964603662720.0, "grad_norm": 0.7818850517624751, "language_loss": 0.56076711, "learning_rate": 1.0389236975121782e-06, "loss": 0.58136988, "num_input_tokens_seen": 119906015, "step": 5570, "time_per_iteration": 3.004574775695801 }, { "auxiliary_loss_clip": 0.01169897, "auxiliary_loss_mlp": 0.0102822, "balance_loss_clip": 1.05038977, "balance_loss_mlp": 1.02121341, "epoch": 0.6698731437503758, "flos": 20886939279360.0, "grad_norm": 2.2131459592124334, "language_loss": 0.71234721, "learning_rate": 1.0382406326836147e-06, "loss": 0.73432839, "num_input_tokens_seen": 119925160, "step": 5571, "time_per_iteration": 3.8365190029144287 }, { "auxiliary_loss_clip": 0.01160506, "auxiliary_loss_mlp": 0.01027352, "balance_loss_clip": 1.05019534, "balance_loss_mlp": 1.01958227, "epoch": 0.6699933866410148, "flos": 20409470766720.0, "grad_norm": 1.855072805673821, "language_loss": 0.75773358, "learning_rate": 1.0375577137531828e-06, "loss": 0.77961218, "num_input_tokens_seen": 119943720, "step": 5572, "time_per_iteration": 2.4996402263641357 }, { "auxiliary_loss_clip": 0.01144964, "auxiliary_loss_mlp": 0.01028823, "balance_loss_clip": 1.04877305, "balance_loss_mlp": 1.02191186, "epoch": 0.670113629531654, "flos": 29023075900800.0, "grad_norm": 1.516798380498256, "language_loss": 0.71746212, "learning_rate": 1.0368749408244802e-06, "loss": 0.73919994, "num_input_tokens_seen": 119966640, "step": 5573, "time_per_iteration": 2.609740972518921 }, { "auxiliary_loss_clip": 0.0114874, "auxiliary_loss_mlp": 0.01028435, "balance_loss_clip": 1.0479461, "balance_loss_mlp": 1.02145791, "epoch": 0.670233872422293, "flos": 19791699730560.0, "grad_norm": 1.7979942128597632, "language_loss": 0.78789556, "learning_rate": 1.0361923140010836e-06, "loss": 0.80966723, "num_input_tokens_seen": 119985125, "step": 5574, "time_per_iteration": 4.084025621414185 }, { "auxiliary_loss_clip": 0.0115747, "auxiliary_loss_mlp": 0.01028597, "balance_loss_clip": 1.04720938, "balance_loss_mlp": 1.02112842, "epoch": 0.6703541153129321, "flos": 24243689070720.0, "grad_norm": 1.8746330906275772, "language_loss": 0.6320945, "learning_rate": 1.0355098333865455e-06, "loss": 0.65395516, "num_input_tokens_seen": 120004355, "step": 5575, "time_per_iteration": 2.503755807876587 }, { "auxiliary_loss_clip": 0.01148403, "auxiliary_loss_mlp": 0.010303, "balance_loss_clip": 1.04984331, "balance_loss_mlp": 1.0238744, "epoch": 0.6704743582035713, "flos": 26688523351680.0, "grad_norm": 1.7896202502925787, "language_loss": 0.69266891, "learning_rate": 1.0348274990844006e-06, "loss": 0.71445596, "num_input_tokens_seen": 120027115, "step": 5576, "time_per_iteration": 2.558018207550049 }, { "auxiliary_loss_clip": 0.01153887, "auxiliary_loss_mlp": 0.01028922, "balance_loss_clip": 1.05035889, "balance_loss_mlp": 1.02213049, "epoch": 0.6705946010942103, "flos": 23514379326720.0, "grad_norm": 1.9743005460654202, "language_loss": 0.72356331, "learning_rate": 1.034145311198155e-06, "loss": 0.74539137, "num_input_tokens_seen": 120047130, "step": 5577, "time_per_iteration": 2.5072097778320312 }, { "auxiliary_loss_clip": 0.01165712, "auxiliary_loss_mlp": 0.01027056, "balance_loss_clip": 1.04989886, "balance_loss_mlp": 1.02000439, "epoch": 0.6707148439848494, "flos": 24061011477120.0, "grad_norm": 2.024801785567019, "language_loss": 0.64129579, "learning_rate": 1.0334632698312989e-06, "loss": 0.66322351, "num_input_tokens_seen": 120067925, "step": 5578, "time_per_iteration": 2.4741666316986084 }, { "auxiliary_loss_clip": 0.01132905, "auxiliary_loss_mlp": 0.0102743, "balance_loss_clip": 1.04525304, "balance_loss_mlp": 1.02024126, "epoch": 0.6708350868754885, "flos": 22528667324160.0, "grad_norm": 2.068881561404667, "language_loss": 0.75310671, "learning_rate": 1.032781375087295e-06, "loss": 0.77471006, "num_input_tokens_seen": 120087825, "step": 5579, "time_per_iteration": 2.53546142578125 }, { "auxiliary_loss_clip": 0.01142093, "auxiliary_loss_mlp": 0.01024688, "balance_loss_clip": 1.04964995, "balance_loss_mlp": 1.01820338, "epoch": 0.6709553297661276, "flos": 25227749047680.0, "grad_norm": 1.358191038607805, "language_loss": 0.67313826, "learning_rate": 1.0320996270695891e-06, "loss": 0.69480604, "num_input_tokens_seen": 120108895, "step": 5580, "time_per_iteration": 2.5693914890289307 }, { "auxiliary_loss_clip": 0.01126307, "auxiliary_loss_mlp": 0.01025477, "balance_loss_clip": 1.043571, "balance_loss_mlp": 1.01826215, "epoch": 0.6710755726567667, "flos": 20448757267200.0, "grad_norm": 1.6442564703658582, "language_loss": 0.73293674, "learning_rate": 1.0314180258815998e-06, "loss": 0.75445461, "num_input_tokens_seen": 120127535, "step": 5581, "time_per_iteration": 2.585735321044922 }, { "auxiliary_loss_clip": 0.01114345, "auxiliary_loss_mlp": 0.01028044, "balance_loss_clip": 1.04184961, "balance_loss_mlp": 1.02121592, "epoch": 0.6711958155474057, "flos": 25995411538560.0, "grad_norm": 1.5125267838184913, "language_loss": 0.74144101, "learning_rate": 1.0307365716267247e-06, "loss": 0.76286489, "num_input_tokens_seen": 120147980, "step": 5582, "time_per_iteration": 2.6718838214874268 }, { "auxiliary_loss_clip": 0.01152481, "auxiliary_loss_mlp": 0.01025938, "balance_loss_clip": 1.04799795, "balance_loss_mlp": 1.01854157, "epoch": 0.6713160584380449, "flos": 19937712516480.0, "grad_norm": 2.089826249419647, "language_loss": 0.78489351, "learning_rate": 1.0300552644083423e-06, "loss": 0.8066777, "num_input_tokens_seen": 120166905, "step": 5583, "time_per_iteration": 2.520772933959961 }, { "auxiliary_loss_clip": 0.01127301, "auxiliary_loss_mlp": 0.01033137, "balance_loss_clip": 1.0463779, "balance_loss_mlp": 1.02587712, "epoch": 0.6714363013286839, "flos": 18223373128320.0, "grad_norm": 2.133466655843785, "language_loss": 0.72036791, "learning_rate": 1.0293741043298036e-06, "loss": 0.74197233, "num_input_tokens_seen": 120185255, "step": 5584, "time_per_iteration": 2.5382838249206543 }, { "auxiliary_loss_clip": 0.01129243, "auxiliary_loss_mlp": 0.01029458, "balance_loss_clip": 1.05068684, "balance_loss_mlp": 1.02186418, "epoch": 0.671556544219323, "flos": 25812374808960.0, "grad_norm": 2.041853360570037, "language_loss": 0.71572477, "learning_rate": 1.0286930914944436e-06, "loss": 0.73731172, "num_input_tokens_seen": 120205070, "step": 5585, "time_per_iteration": 2.590501546859741 }, { "auxiliary_loss_clip": 0.01165305, "auxiliary_loss_mlp": 0.01024664, "balance_loss_clip": 1.04581845, "balance_loss_mlp": 1.01771164, "epoch": 0.6716767871099621, "flos": 15850431918720.0, "grad_norm": 2.599492601200961, "language_loss": 0.76944029, "learning_rate": 1.0280122260055684e-06, "loss": 0.79133999, "num_input_tokens_seen": 120220780, "step": 5586, "time_per_iteration": 2.4142444133758545 }, { "auxiliary_loss_clip": 0.01170369, "auxiliary_loss_mlp": 0.01033155, "balance_loss_clip": 1.05141199, "balance_loss_mlp": 1.02549613, "epoch": 0.6717970300006012, "flos": 19756112330880.0, "grad_norm": 2.4971675567811187, "language_loss": 0.82084751, "learning_rate": 1.0273315079664652e-06, "loss": 0.84288275, "num_input_tokens_seen": 120238735, "step": 5587, "time_per_iteration": 2.4471116065979004 }, { "auxiliary_loss_clip": 0.01158748, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.05074167, "balance_loss_mlp": 1.02257049, "epoch": 0.6719172728912403, "flos": 25485049146240.0, "grad_norm": 2.72391156285851, "language_loss": 0.74218911, "learning_rate": 1.0266509374803992e-06, "loss": 0.76407576, "num_input_tokens_seen": 120259895, "step": 5588, "time_per_iteration": 3.269397497177124 }, { "auxiliary_loss_clip": 0.01168759, "auxiliary_loss_mlp": 0.00761233, "balance_loss_clip": 1.05024743, "balance_loss_mlp": 1.00026107, "epoch": 0.6720375157818794, "flos": 15880344969600.0, "grad_norm": 2.501138013440124, "language_loss": 0.84629524, "learning_rate": 1.0259705146506123e-06, "loss": 0.86559522, "num_input_tokens_seen": 120274790, "step": 5589, "time_per_iteration": 2.433943748474121 }, { "auxiliary_loss_clip": 0.01156213, "auxiliary_loss_mlp": 0.01032152, "balance_loss_clip": 1.04870033, "balance_loss_mlp": 1.02483535, "epoch": 0.6721577586725185, "flos": 32010843231360.0, "grad_norm": 1.818213075841815, "language_loss": 0.77502263, "learning_rate": 1.025290239580324e-06, "loss": 0.79690629, "num_input_tokens_seen": 120295460, "step": 5590, "time_per_iteration": 2.5599887371063232 }, { "auxiliary_loss_clip": 0.01110054, "auxiliary_loss_mlp": 0.01029976, "balance_loss_clip": 1.04209065, "balance_loss_mlp": 1.02238846, "epoch": 0.6722780015631575, "flos": 20737873837440.0, "grad_norm": 1.590220842833504, "language_loss": 0.75230956, "learning_rate": 1.0246101123727313e-06, "loss": 0.77370989, "num_input_tokens_seen": 120314440, "step": 5591, "time_per_iteration": 2.5965328216552734 }, { "auxiliary_loss_clip": 0.011515, "auxiliary_loss_mlp": 0.01029133, "balance_loss_clip": 1.0459609, "balance_loss_mlp": 1.02253735, "epoch": 0.6723982444537967, "flos": 16909617191040.0, "grad_norm": 3.1481547306513478, "language_loss": 0.78647184, "learning_rate": 1.0239301331310085e-06, "loss": 0.8082782, "num_input_tokens_seen": 120332060, "step": 5592, "time_per_iteration": 2.453963279724121 }, { "auxiliary_loss_clip": 0.01152495, "auxiliary_loss_mlp": 0.01028078, "balance_loss_clip": 1.0494355, "balance_loss_mlp": 1.02129769, "epoch": 0.6725184873444358, "flos": 20667812359680.0, "grad_norm": 1.6689004464227335, "language_loss": 0.88513166, "learning_rate": 1.0232503019583088e-06, "loss": 0.90693736, "num_input_tokens_seen": 120351670, "step": 5593, "time_per_iteration": 2.508453130722046 }, { "auxiliary_loss_clip": 0.01149579, "auxiliary_loss_mlp": 0.01030868, "balance_loss_clip": 1.04842234, "balance_loss_mlp": 1.02366138, "epoch": 0.6726387302350748, "flos": 23727616416000.0, "grad_norm": 1.6506809819498072, "language_loss": 0.69840574, "learning_rate": 1.0225706189577619e-06, "loss": 0.72021019, "num_input_tokens_seen": 120370195, "step": 5594, "time_per_iteration": 2.5085299015045166 }, { "auxiliary_loss_clip": 0.01154909, "auxiliary_loss_mlp": 0.01021518, "balance_loss_clip": 1.05016541, "balance_loss_mlp": 1.01399946, "epoch": 0.672758973125714, "flos": 15188274650880.0, "grad_norm": 1.8722159157473302, "language_loss": 0.746508, "learning_rate": 1.021891084232475e-06, "loss": 0.76827222, "num_input_tokens_seen": 120388130, "step": 5595, "time_per_iteration": 2.47799015045166 }, { "auxiliary_loss_clip": 0.01153065, "auxiliary_loss_mlp": 0.01027947, "balance_loss_clip": 1.04717815, "balance_loss_mlp": 1.02005827, "epoch": 0.672879216016353, "flos": 18077252601600.0, "grad_norm": 2.740297985665008, "language_loss": 0.79495478, "learning_rate": 1.0212116978855325e-06, "loss": 0.81676483, "num_input_tokens_seen": 120406145, "step": 5596, "time_per_iteration": 2.4593453407287598 }, { "auxiliary_loss_clip": 0.01125024, "auxiliary_loss_mlp": 0.01022758, "balance_loss_clip": 1.04653907, "balance_loss_mlp": 1.01580167, "epoch": 0.6729994589069921, "flos": 23476349802240.0, "grad_norm": 1.6393770773005563, "language_loss": 0.78784627, "learning_rate": 1.020532460019997e-06, "loss": 0.80932409, "num_input_tokens_seen": 120425395, "step": 5597, "time_per_iteration": 3.361593246459961 }, { "auxiliary_loss_clip": 0.01087678, "auxiliary_loss_mlp": 0.01033343, "balance_loss_clip": 1.0431869, "balance_loss_mlp": 1.02625871, "epoch": 0.6731197017976313, "flos": 26322018929280.0, "grad_norm": 1.815399924540102, "language_loss": 0.70913053, "learning_rate": 1.0198533707389096e-06, "loss": 0.73034072, "num_input_tokens_seen": 120446270, "step": 5598, "time_per_iteration": 2.6984219551086426 }, { "auxiliary_loss_clip": 0.01150891, "auxiliary_loss_mlp": 0.00761015, "balance_loss_clip": 1.04896367, "balance_loss_mlp": 1.00022411, "epoch": 0.6732399446882703, "flos": 21616428591360.0, "grad_norm": 1.8458204456577636, "language_loss": 0.73297977, "learning_rate": 1.0191744301452853e-06, "loss": 0.75209892, "num_input_tokens_seen": 120465570, "step": 5599, "time_per_iteration": 2.5050244331359863 }, { "auxiliary_loss_clip": 0.0116548, "auxiliary_loss_mlp": 0.01029399, "balance_loss_clip": 1.04830146, "balance_loss_mlp": 1.0227828, "epoch": 0.6733601875789094, "flos": 25880173729920.0, "grad_norm": 1.8568092586394664, "language_loss": 0.70204842, "learning_rate": 1.0184956383421208e-06, "loss": 0.72399724, "num_input_tokens_seen": 120484220, "step": 5600, "time_per_iteration": 3.33374285697937 }, { "auxiliary_loss_clip": 0.01156555, "auxiliary_loss_mlp": 0.01022352, "balance_loss_clip": 1.04980803, "balance_loss_mlp": 1.01524127, "epoch": 0.6734804304695485, "flos": 22929573997440.0, "grad_norm": 1.8940662409910956, "language_loss": 0.65173364, "learning_rate": 1.017816995432387e-06, "loss": 0.67352271, "num_input_tokens_seen": 120503320, "step": 5601, "time_per_iteration": 2.4926583766937256 }, { "auxiliary_loss_clip": 0.01139294, "auxiliary_loss_mlp": 0.01026941, "balance_loss_clip": 1.04849255, "balance_loss_mlp": 1.01992226, "epoch": 0.6736006733601876, "flos": 18697968552960.0, "grad_norm": 1.819085015517038, "language_loss": 0.7397477, "learning_rate": 1.0171385015190353e-06, "loss": 0.76141006, "num_input_tokens_seen": 120523180, "step": 5602, "time_per_iteration": 2.548186779022217 }, { "auxiliary_loss_clip": 0.01131634, "auxiliary_loss_mlp": 0.00760118, "balance_loss_clip": 1.04677713, "balance_loss_mlp": 1.0002197, "epoch": 0.6737209162508266, "flos": 19427745173760.0, "grad_norm": 2.025685980620832, "language_loss": 0.73151445, "learning_rate": 1.0164601567049908e-06, "loss": 0.75043201, "num_input_tokens_seen": 120541710, "step": 5603, "time_per_iteration": 2.5246217250823975 }, { "auxiliary_loss_clip": 0.011387, "auxiliary_loss_mlp": 0.0103013, "balance_loss_clip": 1.04737306, "balance_loss_mlp": 1.02271533, "epoch": 0.6738411591414658, "flos": 20158060498560.0, "grad_norm": 1.646592162129762, "language_loss": 0.80185032, "learning_rate": 1.015781961093158e-06, "loss": 0.8235386, "num_input_tokens_seen": 120561030, "step": 5604, "time_per_iteration": 2.5261034965515137 }, { "auxiliary_loss_clip": 0.01141742, "auxiliary_loss_mlp": 0.01027064, "balance_loss_clip": 1.04388463, "balance_loss_mlp": 1.02017331, "epoch": 0.6739614020321049, "flos": 21653847584640.0, "grad_norm": 1.60725864575364, "language_loss": 0.76845413, "learning_rate": 1.0151039147864197e-06, "loss": 0.79014218, "num_input_tokens_seen": 120581005, "step": 5605, "time_per_iteration": 2.5183680057525635 }, { "auxiliary_loss_clip": 0.0107893, "auxiliary_loss_mlp": 0.01030354, "balance_loss_clip": 1.04428983, "balance_loss_mlp": 1.02276301, "epoch": 0.6740816449227439, "flos": 19171702051200.0, "grad_norm": 2.200586647834062, "language_loss": 0.65946084, "learning_rate": 1.0144260178876336e-06, "loss": 0.68055367, "num_input_tokens_seen": 120600350, "step": 5606, "time_per_iteration": 2.8401858806610107 }, { "auxiliary_loss_clip": 0.01145269, "auxiliary_loss_mlp": 0.01028649, "balance_loss_clip": 1.0471251, "balance_loss_mlp": 1.02190518, "epoch": 0.6742018878133831, "flos": 21097015971840.0, "grad_norm": 2.739664168652078, "language_loss": 0.67630696, "learning_rate": 1.0137482704996388e-06, "loss": 0.69804615, "num_input_tokens_seen": 120614700, "step": 5607, "time_per_iteration": 2.7237963676452637 }, { "auxiliary_loss_clip": 0.01131641, "auxiliary_loss_mlp": 0.01029267, "balance_loss_clip": 1.0475843, "balance_loss_mlp": 1.02177787, "epoch": 0.6743221307040221, "flos": 23549966726400.0, "grad_norm": 3.0630658034956038, "language_loss": 0.78620422, "learning_rate": 1.0130706727252461e-06, "loss": 0.80781329, "num_input_tokens_seen": 120631755, "step": 5608, "time_per_iteration": 2.571286916732788 }, { "auxiliary_loss_clip": 0.0112788, "auxiliary_loss_mlp": 0.01026673, "balance_loss_clip": 1.04521418, "balance_loss_mlp": 1.0195291, "epoch": 0.6744423735946612, "flos": 16249542912000.0, "grad_norm": 5.649036952668872, "language_loss": 0.68030095, "learning_rate": 1.0123932246672468e-06, "loss": 0.70184642, "num_input_tokens_seen": 120645900, "step": 5609, "time_per_iteration": 2.511312246322632 }, { "auxiliary_loss_clip": 0.01019744, "auxiliary_loss_mlp": 0.00751181, "balance_loss_clip": 1.01267087, "balance_loss_mlp": 1.00007105, "epoch": 0.6745626164853004, "flos": 57843257829120.0, "grad_norm": 0.7500119426160425, "language_loss": 0.55882072, "learning_rate": 1.0117159264284114e-06, "loss": 0.57652998, "num_input_tokens_seen": 120709070, "step": 5610, "time_per_iteration": 3.135748863220215 }, { "auxiliary_loss_clip": 0.01141336, "auxiliary_loss_mlp": 0.01031122, "balance_loss_clip": 1.04784787, "balance_loss_mlp": 1.02424979, "epoch": 0.6746828593759394, "flos": 20485027025280.0, "grad_norm": 1.6642417962332647, "language_loss": 0.77243227, "learning_rate": 1.0110387781114837e-06, "loss": 0.79415685, "num_input_tokens_seen": 120727685, "step": 5611, "time_per_iteration": 2.5182440280914307 }, { "auxiliary_loss_clip": 0.01166848, "auxiliary_loss_mlp": 0.01026538, "balance_loss_clip": 1.04969597, "balance_loss_mlp": 1.0194931, "epoch": 0.6748031022665785, "flos": 19208223204480.0, "grad_norm": 1.9051573700446875, "language_loss": 0.77147752, "learning_rate": 1.0103617798191872e-06, "loss": 0.79341137, "num_input_tokens_seen": 120747160, "step": 5612, "time_per_iteration": 2.4525206089019775 }, { "auxiliary_loss_clip": 0.0113383, "auxiliary_loss_mlp": 0.01026264, "balance_loss_clip": 1.0474391, "balance_loss_mlp": 1.01931977, "epoch": 0.6749233451572175, "flos": 15195026407680.0, "grad_norm": 2.231671946629498, "language_loss": 0.82251477, "learning_rate": 1.0096849316542217e-06, "loss": 0.84411573, "num_input_tokens_seen": 120763710, "step": 5613, "time_per_iteration": 3.5578079223632812 }, { "auxiliary_loss_clip": 0.01071236, "auxiliary_loss_mlp": 0.01025955, "balance_loss_clip": 1.03774261, "balance_loss_mlp": 1.01886189, "epoch": 0.6750435880478567, "flos": 26499489050880.0, "grad_norm": 2.2122038519486784, "language_loss": 0.74620622, "learning_rate": 1.0090082337192643e-06, "loss": 0.76717818, "num_input_tokens_seen": 120783355, "step": 5614, "time_per_iteration": 2.7532026767730713 }, { "auxiliary_loss_clip": 0.01091725, "auxiliary_loss_mlp": 0.01029368, "balance_loss_clip": 1.03738201, "balance_loss_mlp": 1.02271271, "epoch": 0.6751638309384957, "flos": 23404313076480.0, "grad_norm": 2.344904097156709, "language_loss": 0.78604239, "learning_rate": 1.0083316861169705e-06, "loss": 0.80725336, "num_input_tokens_seen": 120802090, "step": 5615, "time_per_iteration": 2.7281432151794434 }, { "auxiliary_loss_clip": 0.01130988, "auxiliary_loss_mlp": 0.01026237, "balance_loss_clip": 1.04426432, "balance_loss_mlp": 1.01837492, "epoch": 0.6752840738291348, "flos": 23441408847360.0, "grad_norm": 2.1742020250792655, "language_loss": 0.71401656, "learning_rate": 1.0076552889499713e-06, "loss": 0.73558879, "num_input_tokens_seen": 120822855, "step": 5616, "time_per_iteration": 2.615874767303467 }, { "auxiliary_loss_clip": 0.01153011, "auxiliary_loss_mlp": 0.01026031, "balance_loss_clip": 1.05025864, "balance_loss_mlp": 1.01874471, "epoch": 0.675404316719774, "flos": 30335826257280.0, "grad_norm": 1.8079908130409788, "language_loss": 0.73238385, "learning_rate": 1.006979042320876e-06, "loss": 0.75417423, "num_input_tokens_seen": 120843070, "step": 5617, "time_per_iteration": 2.561631917953491 }, { "auxiliary_loss_clip": 0.01133962, "auxiliary_loss_mlp": 0.01025517, "balance_loss_clip": 1.04316115, "balance_loss_mlp": 1.01877856, "epoch": 0.675524559610413, "flos": 23622613983360.0, "grad_norm": 1.9926272994884273, "language_loss": 0.62958562, "learning_rate": 1.0063029463322702e-06, "loss": 0.65118039, "num_input_tokens_seen": 120863345, "step": 5618, "time_per_iteration": 2.5540215969085693 }, { "auxiliary_loss_clip": 0.01103814, "auxiliary_loss_mlp": 0.00761173, "balance_loss_clip": 1.04059279, "balance_loss_mlp": 1.00025094, "epoch": 0.6756448025010521, "flos": 21248631279360.0, "grad_norm": 2.00728576785247, "language_loss": 0.75259781, "learning_rate": 1.0056270010867164e-06, "loss": 0.77124774, "num_input_tokens_seen": 120880915, "step": 5619, "time_per_iteration": 2.592349052429199 }, { "auxiliary_loss_clip": 0.01140104, "auxiliary_loss_mlp": 0.01026396, "balance_loss_clip": 1.04412746, "balance_loss_mlp": 1.01889455, "epoch": 0.6757650453916912, "flos": 21646521210240.0, "grad_norm": 2.3212290121060044, "language_loss": 0.78090179, "learning_rate": 1.004951206686758e-06, "loss": 0.80256677, "num_input_tokens_seen": 120899190, "step": 5620, "time_per_iteration": 2.5200958251953125 }, { "auxiliary_loss_clip": 0.01148898, "auxiliary_loss_mlp": 0.01028982, "balance_loss_clip": 1.04861355, "balance_loss_mlp": 1.02157283, "epoch": 0.6758852882823303, "flos": 21795658479360.0, "grad_norm": 1.9137869815505157, "language_loss": 0.71545345, "learning_rate": 1.0042755632349087e-06, "loss": 0.73723221, "num_input_tokens_seen": 120916080, "step": 5621, "time_per_iteration": 2.4919209480285645 }, { "auxiliary_loss_clip": 0.01126203, "auxiliary_loss_mlp": 0.01036462, "balance_loss_clip": 1.04683232, "balance_loss_mlp": 1.02886236, "epoch": 0.6760055311729694, "flos": 27088783580160.0, "grad_norm": 2.158112664638395, "language_loss": 0.62524354, "learning_rate": 1.0036000708336653e-06, "loss": 0.64687026, "num_input_tokens_seen": 120935210, "step": 5622, "time_per_iteration": 2.587871789932251 }, { "auxiliary_loss_clip": 0.01143138, "auxiliary_loss_mlp": 0.01029442, "balance_loss_clip": 1.04742992, "balance_loss_mlp": 1.02244687, "epoch": 0.6761257740636085, "flos": 17999792922240.0, "grad_norm": 2.0248863099934455, "language_loss": 0.79795134, "learning_rate": 1.0029247295854984e-06, "loss": 0.81967723, "num_input_tokens_seen": 120951830, "step": 5623, "time_per_iteration": 3.287555456161499 }, { "auxiliary_loss_clip": 0.01130323, "auxiliary_loss_mlp": 0.01027931, "balance_loss_clip": 1.0457654, "balance_loss_mlp": 1.02122295, "epoch": 0.6762460169542476, "flos": 15121912273920.0, "grad_norm": 1.8985606912453017, "language_loss": 0.71915501, "learning_rate": 1.0022495395928588e-06, "loss": 0.74073756, "num_input_tokens_seen": 120970310, "step": 5624, "time_per_iteration": 2.5237278938293457 }, { "auxiliary_loss_clip": 0.01058928, "auxiliary_loss_mlp": 0.01007389, "balance_loss_clip": 1.01137948, "balance_loss_mlp": 1.00629854, "epoch": 0.6763662598448866, "flos": 67886970030720.0, "grad_norm": 0.7941553470007131, "language_loss": 0.62379301, "learning_rate": 1.0015745009581697e-06, "loss": 0.64445621, "num_input_tokens_seen": 121031915, "step": 5625, "time_per_iteration": 3.103691577911377 }, { "auxiliary_loss_clip": 0.01151692, "auxiliary_loss_mlp": 0.01032366, "balance_loss_clip": 1.05043554, "balance_loss_mlp": 1.0252161, "epoch": 0.6764865027355258, "flos": 20631829910400.0, "grad_norm": 1.6626226566674922, "language_loss": 0.66980475, "learning_rate": 1.0008996137838343e-06, "loss": 0.69164538, "num_input_tokens_seen": 121050890, "step": 5626, "time_per_iteration": 3.266974687576294 }, { "auxiliary_loss_clip": 0.01171812, "auxiliary_loss_mlp": 0.0102789, "balance_loss_clip": 1.051368, "balance_loss_mlp": 1.0195899, "epoch": 0.6766067456261649, "flos": 21215809226880.0, "grad_norm": 2.382799619742517, "language_loss": 0.79655623, "learning_rate": 1.000224878172234e-06, "loss": 0.81855333, "num_input_tokens_seen": 121070015, "step": 5627, "time_per_iteration": 2.4623026847839355 }, { "auxiliary_loss_clip": 0.01153835, "auxiliary_loss_mlp": 0.01026899, "balance_loss_clip": 1.04951334, "balance_loss_mlp": 1.02007461, "epoch": 0.6767269885168039, "flos": 19938251220480.0, "grad_norm": 2.5316718897659283, "language_loss": 0.72528887, "learning_rate": 9.99550294225724e-07, "loss": 0.74709624, "num_input_tokens_seen": 121089170, "step": 5628, "time_per_iteration": 2.503897190093994 }, { "auxiliary_loss_clip": 0.01109311, "auxiliary_loss_mlp": 0.01024598, "balance_loss_clip": 1.03917241, "balance_loss_mlp": 1.01746976, "epoch": 0.6768472314074431, "flos": 20814076540800.0, "grad_norm": 3.2648796698543703, "language_loss": 0.72131795, "learning_rate": 9.988758620466402e-07, "loss": 0.74265707, "num_input_tokens_seen": 121108040, "step": 5629, "time_per_iteration": 2.570328712463379 }, { "auxiliary_loss_clip": 0.0109918, "auxiliary_loss_mlp": 0.01026491, "balance_loss_clip": 1.04356527, "balance_loss_mlp": 1.01983666, "epoch": 0.6769674742980821, "flos": 23186012169600.0, "grad_norm": 1.5500317886842934, "language_loss": 0.76286632, "learning_rate": 9.982015817372917e-07, "loss": 0.78412306, "num_input_tokens_seen": 121128480, "step": 5630, "time_per_iteration": 2.642570734024048 }, { "auxiliary_loss_clip": 0.01108442, "auxiliary_loss_mlp": 0.01028324, "balance_loss_clip": 1.04074121, "balance_loss_mlp": 1.02120769, "epoch": 0.6770877171887212, "flos": 24242934885120.0, "grad_norm": 1.8950674540593428, "language_loss": 0.82045722, "learning_rate": 9.975274533999657e-07, "loss": 0.84182489, "num_input_tokens_seen": 121148010, "step": 5631, "time_per_iteration": 2.597531318664551 }, { "auxiliary_loss_clip": 0.01166663, "auxiliary_loss_mlp": 0.01030726, "balance_loss_clip": 1.04847169, "balance_loss_mlp": 1.02273893, "epoch": 0.6772079600793603, "flos": 18141567903360.0, "grad_norm": 2.500745267480339, "language_loss": 0.83791614, "learning_rate": 9.96853477136929e-07, "loss": 0.8598901, "num_input_tokens_seen": 121162755, "step": 5632, "time_per_iteration": 2.421985387802124 }, { "auxiliary_loss_clip": 0.01114386, "auxiliary_loss_mlp": 0.01027447, "balance_loss_clip": 1.04152417, "balance_loss_mlp": 1.02066433, "epoch": 0.6773282029699994, "flos": 22452069571200.0, "grad_norm": 2.076387287775808, "language_loss": 0.7533766, "learning_rate": 9.96179653050422e-07, "loss": 0.77479494, "num_input_tokens_seen": 121182915, "step": 5633, "time_per_iteration": 2.5609376430511475 }, { "auxiliary_loss_clip": 0.01117954, "auxiliary_loss_mlp": 0.01034275, "balance_loss_clip": 1.04390585, "balance_loss_mlp": 1.02722406, "epoch": 0.6774484458606385, "flos": 18693730748160.0, "grad_norm": 2.107809326297555, "language_loss": 0.74360967, "learning_rate": 9.955059812426635e-07, "loss": 0.76513195, "num_input_tokens_seen": 121200445, "step": 5634, "time_per_iteration": 2.5468695163726807 }, { "auxiliary_loss_clip": 0.01166235, "auxiliary_loss_mlp": 0.01025762, "balance_loss_clip": 1.05083668, "balance_loss_mlp": 1.01894021, "epoch": 0.6775686887512776, "flos": 25994046821760.0, "grad_norm": 2.055614949299435, "language_loss": 0.82714647, "learning_rate": 9.948324618158493e-07, "loss": 0.84906644, "num_input_tokens_seen": 121220785, "step": 5635, "time_per_iteration": 2.494586944580078 }, { "auxiliary_loss_clip": 0.01153981, "auxiliary_loss_mlp": 0.01031252, "balance_loss_clip": 1.04545069, "balance_loss_mlp": 1.02315736, "epoch": 0.6776889316419167, "flos": 13587987922560.0, "grad_norm": 2.397582094042597, "language_loss": 0.77526677, "learning_rate": 9.941590948721502e-07, "loss": 0.79711914, "num_input_tokens_seen": 121237985, "step": 5636, "time_per_iteration": 2.4574460983276367 }, { "auxiliary_loss_clip": 0.01134762, "auxiliary_loss_mlp": 0.01024136, "balance_loss_clip": 1.0475297, "balance_loss_mlp": 1.01739156, "epoch": 0.6778091745325557, "flos": 27601121220480.0, "grad_norm": 1.7956688914882935, "language_loss": 0.76217723, "learning_rate": 9.934858805137188e-07, "loss": 0.78376621, "num_input_tokens_seen": 121258635, "step": 5637, "time_per_iteration": 2.5655081272125244 }, { "auxiliary_loss_clip": 0.01149741, "auxiliary_loss_mlp": 0.01029855, "balance_loss_clip": 1.04762554, "balance_loss_mlp": 1.02309632, "epoch": 0.6779294174231949, "flos": 18734058743040.0, "grad_norm": 1.6172305271493224, "language_loss": 0.80971402, "learning_rate": 9.92812818842677e-07, "loss": 0.83150995, "num_input_tokens_seen": 121277810, "step": 5638, "time_per_iteration": 2.5537209510803223 }, { "auxiliary_loss_clip": 0.01146534, "auxiliary_loss_mlp": 0.01026884, "balance_loss_clip": 1.04530311, "balance_loss_mlp": 1.02003813, "epoch": 0.678049660313834, "flos": 45873797765760.0, "grad_norm": 1.6179425309043678, "language_loss": 0.64164066, "learning_rate": 9.921399099611306e-07, "loss": 0.6633749, "num_input_tokens_seen": 121298975, "step": 5639, "time_per_iteration": 3.4619028568267822 }, { "auxiliary_loss_clip": 0.01141515, "auxiliary_loss_mlp": 0.01028127, "balance_loss_clip": 1.04769456, "balance_loss_mlp": 1.02082539, "epoch": 0.678169903204473, "flos": 19974556892160.0, "grad_norm": 1.6530889437661458, "language_loss": 0.68850207, "learning_rate": 9.914671539711588e-07, "loss": 0.71019852, "num_input_tokens_seen": 121318495, "step": 5640, "time_per_iteration": 2.562713146209717 }, { "auxiliary_loss_clip": 0.0107122, "auxiliary_loss_mlp": 0.00761056, "balance_loss_clip": 1.04218102, "balance_loss_mlp": 1.00023866, "epoch": 0.6782901460951122, "flos": 21395613732480.0, "grad_norm": 1.8342938812211296, "language_loss": 0.78478271, "learning_rate": 9.90794550974817e-07, "loss": 0.80310547, "num_input_tokens_seen": 121338890, "step": 5641, "time_per_iteration": 2.9581198692321777 }, { "auxiliary_loss_clip": 0.0112114, "auxiliary_loss_mlp": 0.01026589, "balance_loss_clip": 1.04469383, "balance_loss_mlp": 1.01955545, "epoch": 0.6784103889857512, "flos": 21434002392960.0, "grad_norm": 2.7032532320695712, "language_loss": 0.81421232, "learning_rate": 9.901221010741407e-07, "loss": 0.83568966, "num_input_tokens_seen": 121358210, "step": 5642, "time_per_iteration": 2.758780002593994 }, { "auxiliary_loss_clip": 0.01157657, "auxiliary_loss_mlp": 0.01025372, "balance_loss_clip": 1.04931164, "balance_loss_mlp": 1.01782298, "epoch": 0.6785306318763903, "flos": 32671923091200.0, "grad_norm": 2.1053609432870197, "language_loss": 0.74695045, "learning_rate": 9.894498043711375e-07, "loss": 0.76878071, "num_input_tokens_seen": 121379955, "step": 5643, "time_per_iteration": 2.588226079940796 }, { "auxiliary_loss_clip": 0.01135889, "auxiliary_loss_mlp": 0.01025074, "balance_loss_clip": 1.04364872, "balance_loss_mlp": 1.01797795, "epoch": 0.6786508747670293, "flos": 25632139340160.0, "grad_norm": 1.8348625494043846, "language_loss": 0.69369376, "learning_rate": 9.887776609677962e-07, "loss": 0.71530342, "num_input_tokens_seen": 121401325, "step": 5644, "time_per_iteration": 2.5913493633270264 }, { "auxiliary_loss_clip": 0.01114779, "auxiliary_loss_mlp": 0.01031371, "balance_loss_clip": 1.04068661, "balance_loss_mlp": 1.02455783, "epoch": 0.6787711176576685, "flos": 19171881619200.0, "grad_norm": 1.5922417622315646, "language_loss": 0.72441834, "learning_rate": 9.88105670966079e-07, "loss": 0.74587983, "num_input_tokens_seen": 121419785, "step": 5645, "time_per_iteration": 2.538856267929077 }, { "auxiliary_loss_clip": 0.01098953, "auxiliary_loss_mlp": 0.01020493, "balance_loss_clip": 1.04312825, "balance_loss_mlp": 1.0136919, "epoch": 0.6788913605483076, "flos": 13985159581440.0, "grad_norm": 1.8505555875579167, "language_loss": 0.78777945, "learning_rate": 9.874338344679283e-07, "loss": 0.80897391, "num_input_tokens_seen": 121435630, "step": 5646, "time_per_iteration": 2.5564157962799072 }, { "auxiliary_loss_clip": 0.01162691, "auxiliary_loss_mlp": 0.01024419, "balance_loss_clip": 1.04840326, "balance_loss_mlp": 1.01760042, "epoch": 0.6790116034389466, "flos": 22017586659840.0, "grad_norm": 1.7265046607893921, "language_loss": 0.73876077, "learning_rate": 9.86762151575259e-07, "loss": 0.76063186, "num_input_tokens_seen": 121455625, "step": 5647, "time_per_iteration": 2.471052408218384 }, { "auxiliary_loss_clip": 0.01111432, "auxiliary_loss_mlp": 0.00759518, "balance_loss_clip": 1.04590297, "balance_loss_mlp": 1.00021648, "epoch": 0.6791318463295858, "flos": 20922454851840.0, "grad_norm": 1.4570120984141495, "language_loss": 0.80214572, "learning_rate": 9.860906223899651e-07, "loss": 0.82085526, "num_input_tokens_seen": 121475020, "step": 5648, "time_per_iteration": 2.593214273452759 }, { "auxiliary_loss_clip": 0.01143552, "auxiliary_loss_mlp": 0.01028544, "balance_loss_clip": 1.04554212, "balance_loss_mlp": 1.02143335, "epoch": 0.6792520892202248, "flos": 28512749422080.0, "grad_norm": 1.5987802355111425, "language_loss": 0.75749528, "learning_rate": 9.854192470139184e-07, "loss": 0.77921629, "num_input_tokens_seen": 121496500, "step": 5649, "time_per_iteration": 3.3567605018615723 }, { "auxiliary_loss_clip": 0.01141105, "auxiliary_loss_mlp": 0.01038676, "balance_loss_clip": 1.04885244, "balance_loss_mlp": 1.03114772, "epoch": 0.6793723321108639, "flos": 20011904058240.0, "grad_norm": 2.2480835734300677, "language_loss": 0.71778071, "learning_rate": 9.847480255489645e-07, "loss": 0.73957849, "num_input_tokens_seen": 121515525, "step": 5650, "time_per_iteration": 2.5109007358551025 }, { "auxiliary_loss_clip": 0.01143977, "auxiliary_loss_mlp": 0.01028748, "balance_loss_clip": 1.0471828, "balance_loss_mlp": 1.02187002, "epoch": 0.6794925750015031, "flos": 26649488246400.0, "grad_norm": 1.6849769833165895, "language_loss": 0.68998837, "learning_rate": 9.840769580969295e-07, "loss": 0.71171564, "num_input_tokens_seen": 121535965, "step": 5651, "time_per_iteration": 2.559161424636841 }, { "auxiliary_loss_clip": 0.01143676, "auxiliary_loss_mlp": 0.01024288, "balance_loss_clip": 1.04555345, "balance_loss_mlp": 1.01742148, "epoch": 0.6796128178921421, "flos": 21580374314880.0, "grad_norm": 1.9116444023774068, "language_loss": 0.80077863, "learning_rate": 9.834060447596114e-07, "loss": 0.82245827, "num_input_tokens_seen": 121555235, "step": 5652, "time_per_iteration": 4.22889518737793 }, { "auxiliary_loss_clip": 0.01153827, "auxiliary_loss_mlp": 0.01025875, "balance_loss_clip": 1.0461266, "balance_loss_mlp": 1.01893377, "epoch": 0.6797330607827812, "flos": 22492002516480.0, "grad_norm": 2.2272921769026235, "language_loss": 0.78169626, "learning_rate": 9.827352856387868e-07, "loss": 0.80349326, "num_input_tokens_seen": 121574945, "step": 5653, "time_per_iteration": 2.547220468521118 }, { "auxiliary_loss_clip": 0.01014504, "auxiliary_loss_mlp": 0.01003005, "balance_loss_clip": 1.01182508, "balance_loss_mlp": 1.00196159, "epoch": 0.6798533036734203, "flos": 66306648286080.0, "grad_norm": 0.7823738579120485, "language_loss": 0.6431722, "learning_rate": 9.820646808362118e-07, "loss": 0.6633473, "num_input_tokens_seen": 121641200, "step": 5654, "time_per_iteration": 3.264039993286133 }, { "auxiliary_loss_clip": 0.01138134, "auxiliary_loss_mlp": 0.01029236, "balance_loss_clip": 1.04973948, "balance_loss_mlp": 1.02234256, "epoch": 0.6799735465640594, "flos": 16180163792640.0, "grad_norm": 2.6049060244326903, "language_loss": 0.72586119, "learning_rate": 9.813942304536154e-07, "loss": 0.74753493, "num_input_tokens_seen": 121659170, "step": 5655, "time_per_iteration": 2.501434803009033 }, { "auxiliary_loss_clip": 0.01140055, "auxiliary_loss_mlp": 0.01026507, "balance_loss_clip": 1.04576862, "balance_loss_mlp": 1.01952767, "epoch": 0.6800937894546984, "flos": 22125749489280.0, "grad_norm": 1.7989801559473582, "language_loss": 0.63698876, "learning_rate": 9.807239345927043e-07, "loss": 0.65865433, "num_input_tokens_seen": 121679180, "step": 5656, "time_per_iteration": 2.536226987838745 }, { "auxiliary_loss_clip": 0.01138536, "auxiliary_loss_mlp": 0.01028155, "balance_loss_clip": 1.04438472, "balance_loss_mlp": 1.02127934, "epoch": 0.6802140323453376, "flos": 31612953300480.0, "grad_norm": 2.0321165603182494, "language_loss": 0.72403836, "learning_rate": 9.80053793355162e-07, "loss": 0.74570525, "num_input_tokens_seen": 121697875, "step": 5657, "time_per_iteration": 2.5965347290039062 }, { "auxiliary_loss_clip": 0.01103326, "auxiliary_loss_mlp": 0.0102704, "balance_loss_clip": 1.04097557, "balance_loss_mlp": 1.01986408, "epoch": 0.6803342752359767, "flos": 17712938908800.0, "grad_norm": 2.2938597929919027, "language_loss": 0.75191844, "learning_rate": 9.793838068426472e-07, "loss": 0.77322221, "num_input_tokens_seen": 121715570, "step": 5658, "time_per_iteration": 2.5831503868103027 }, { "auxiliary_loss_clip": 0.01166918, "auxiliary_loss_mlp": 0.01026327, "balance_loss_clip": 1.05057955, "balance_loss_mlp": 1.01892436, "epoch": 0.6804545181266157, "flos": 11326800902400.0, "grad_norm": 2.7173849621934822, "language_loss": 0.61236942, "learning_rate": 9.78713975156799e-07, "loss": 0.63430178, "num_input_tokens_seen": 121731435, "step": 5659, "time_per_iteration": 2.4325311183929443 }, { "auxiliary_loss_clip": 0.01125207, "auxiliary_loss_mlp": 0.01029424, "balance_loss_clip": 1.04610777, "balance_loss_mlp": 1.02192533, "epoch": 0.6805747610172549, "flos": 29350976181120.0, "grad_norm": 1.8486801483951423, "language_loss": 0.71587217, "learning_rate": 9.780442983992273e-07, "loss": 0.73741853, "num_input_tokens_seen": 121749950, "step": 5660, "time_per_iteration": 2.6124396324157715 }, { "auxiliary_loss_clip": 0.01130801, "auxiliary_loss_mlp": 0.01032966, "balance_loss_clip": 1.04505086, "balance_loss_mlp": 1.02529788, "epoch": 0.680695003907894, "flos": 37631868612480.0, "grad_norm": 1.6071573782146757, "language_loss": 0.71866477, "learning_rate": 9.773747766715238e-07, "loss": 0.74030244, "num_input_tokens_seen": 121770770, "step": 5661, "time_per_iteration": 2.6625874042510986 }, { "auxiliary_loss_clip": 0.01139651, "auxiliary_loss_mlp": 0.01021775, "balance_loss_clip": 1.04577529, "balance_loss_mlp": 1.01432443, "epoch": 0.680815246798533, "flos": 22127365601280.0, "grad_norm": 1.7179918081846364, "language_loss": 0.79807717, "learning_rate": 9.767054100752536e-07, "loss": 0.81969142, "num_input_tokens_seen": 121790720, "step": 5662, "time_per_iteration": 2.525566577911377 }, { "auxiliary_loss_clip": 0.01123828, "auxiliary_loss_mlp": 0.01030101, "balance_loss_clip": 1.04616284, "balance_loss_mlp": 1.02283525, "epoch": 0.6809354896891722, "flos": 17201822330880.0, "grad_norm": 2.0901567309030833, "language_loss": 0.81161952, "learning_rate": 9.760361987119584e-07, "loss": 0.83315885, "num_input_tokens_seen": 121808455, "step": 5663, "time_per_iteration": 2.5406906604766846 }, { "auxiliary_loss_clip": 0.01136944, "auxiliary_loss_mlp": 0.01030694, "balance_loss_clip": 1.04802823, "balance_loss_mlp": 1.02250397, "epoch": 0.6810557325798112, "flos": 12458166554880.0, "grad_norm": 1.8382649777457272, "language_loss": 0.67841476, "learning_rate": 9.753671426831592e-07, "loss": 0.70009112, "num_input_tokens_seen": 121824470, "step": 5664, "time_per_iteration": 2.479050636291504 }, { "auxiliary_loss_clip": 0.01145355, "auxiliary_loss_mlp": 0.01026546, "balance_loss_clip": 1.04465723, "balance_loss_mlp": 1.01971579, "epoch": 0.6811759754704503, "flos": 22156165330560.0, "grad_norm": 1.7772225634655656, "language_loss": 0.79535538, "learning_rate": 9.746982420903483e-07, "loss": 0.81707436, "num_input_tokens_seen": 121842665, "step": 5665, "time_per_iteration": 3.391728639602661 }, { "auxiliary_loss_clip": 0.01150146, "auxiliary_loss_mlp": 0.01023564, "balance_loss_clip": 1.0494591, "balance_loss_mlp": 1.0164119, "epoch": 0.6812962183610894, "flos": 17525377065600.0, "grad_norm": 1.5682757718983629, "language_loss": 0.74863291, "learning_rate": 9.740294970349993e-07, "loss": 0.77037001, "num_input_tokens_seen": 121859080, "step": 5666, "time_per_iteration": 2.4695541858673096 }, { "auxiliary_loss_clip": 0.01039677, "auxiliary_loss_mlp": 0.01001153, "balance_loss_clip": 1.01083994, "balance_loss_mlp": 1.0000205, "epoch": 0.6814164612517285, "flos": 60274480855680.0, "grad_norm": 0.8836716635585617, "language_loss": 0.60894394, "learning_rate": 9.733609076185594e-07, "loss": 0.62935227, "num_input_tokens_seen": 121915485, "step": 5667, "time_per_iteration": 3.009120225906372 }, { "auxiliary_loss_clip": 0.01155378, "auxiliary_loss_mlp": 0.01027934, "balance_loss_clip": 1.04965425, "balance_loss_mlp": 1.02032852, "epoch": 0.6815367041423676, "flos": 19317750750720.0, "grad_norm": 1.7988610773594416, "language_loss": 0.83947885, "learning_rate": 9.72692473942455e-07, "loss": 0.86131203, "num_input_tokens_seen": 121932710, "step": 5668, "time_per_iteration": 2.473071813583374 }, { "auxiliary_loss_clip": 0.01114695, "auxiliary_loss_mlp": 0.01025102, "balance_loss_clip": 1.04689407, "balance_loss_mlp": 1.01764011, "epoch": 0.6816569470330067, "flos": 22161696024960.0, "grad_norm": 1.5675163728800097, "language_loss": 0.77683437, "learning_rate": 9.720241961080849e-07, "loss": 0.79823238, "num_input_tokens_seen": 121952025, "step": 5669, "time_per_iteration": 2.603996515274048 }, { "auxiliary_loss_clip": 0.01166815, "auxiliary_loss_mlp": 0.01028622, "balance_loss_clip": 1.04924011, "balance_loss_mlp": 1.02178514, "epoch": 0.6817771899236458, "flos": 41463501137280.0, "grad_norm": 1.947661232794189, "language_loss": 0.73007536, "learning_rate": 9.713560742168259e-07, "loss": 0.75202978, "num_input_tokens_seen": 121974650, "step": 5670, "time_per_iteration": 2.6379692554473877 }, { "auxiliary_loss_clip": 0.01118536, "auxiliary_loss_mlp": 0.01028356, "balance_loss_clip": 1.04423833, "balance_loss_mlp": 1.02146554, "epoch": 0.6818974328142848, "flos": 21106138026240.0, "grad_norm": 2.003643706207869, "language_loss": 0.71132737, "learning_rate": 9.706881083700333e-07, "loss": 0.73279631, "num_input_tokens_seen": 121994335, "step": 5671, "time_per_iteration": 2.570873975753784 }, { "auxiliary_loss_clip": 0.01095701, "auxiliary_loss_mlp": 0.01025766, "balance_loss_clip": 1.04605794, "balance_loss_mlp": 1.01819587, "epoch": 0.682017675704924, "flos": 20441897769600.0, "grad_norm": 2.1700822220827387, "language_loss": 0.82479972, "learning_rate": 9.700202986690357e-07, "loss": 0.84601438, "num_input_tokens_seen": 122012635, "step": 5672, "time_per_iteration": 2.6114470958709717 }, { "auxiliary_loss_clip": 0.01151184, "auxiliary_loss_mlp": 0.00761161, "balance_loss_clip": 1.04810286, "balance_loss_mlp": 1.00023961, "epoch": 0.682137918595563, "flos": 20044438801920.0, "grad_norm": 4.131740241173308, "language_loss": 0.66546011, "learning_rate": 9.693526452151413e-07, "loss": 0.68458354, "num_input_tokens_seen": 122031685, "step": 5673, "time_per_iteration": 2.5052146911621094 }, { "auxiliary_loss_clip": 0.01129369, "auxiliary_loss_mlp": 0.01024514, "balance_loss_clip": 1.04344487, "balance_loss_mlp": 1.01658916, "epoch": 0.6822581614862021, "flos": 31684559063040.0, "grad_norm": 1.6501846886148293, "language_loss": 0.75593495, "learning_rate": 9.686851481096305e-07, "loss": 0.77747369, "num_input_tokens_seen": 122052995, "step": 5674, "time_per_iteration": 3.4217255115509033 }, { "auxiliary_loss_clip": 0.01090648, "auxiliary_loss_mlp": 0.01026795, "balance_loss_clip": 1.04193497, "balance_loss_mlp": 1.02007174, "epoch": 0.6823784043768413, "flos": 23477570864640.0, "grad_norm": 2.038719204676254, "language_loss": 0.72020006, "learning_rate": 9.68017807453762e-07, "loss": 0.74137449, "num_input_tokens_seen": 122071740, "step": 5675, "time_per_iteration": 2.6241455078125 }, { "auxiliary_loss_clip": 0.01142604, "auxiliary_loss_mlp": 0.00760393, "balance_loss_clip": 1.04835248, "balance_loss_mlp": 1.00020611, "epoch": 0.6824986472674803, "flos": 14137134024960.0, "grad_norm": 1.8425440908073412, "language_loss": 0.73037696, "learning_rate": 9.673506233487721e-07, "loss": 0.74940693, "num_input_tokens_seen": 122089705, "step": 5676, "time_per_iteration": 2.5134241580963135 }, { "auxiliary_loss_clip": 0.011378, "auxiliary_loss_mlp": 0.00759719, "balance_loss_clip": 1.04486632, "balance_loss_mlp": 1.00021887, "epoch": 0.6826188901581194, "flos": 21504997624320.0, "grad_norm": 1.6220014649058054, "language_loss": 0.85952222, "learning_rate": 9.666835958958717e-07, "loss": 0.87849742, "num_input_tokens_seen": 122109025, "step": 5677, "time_per_iteration": 3.4203591346740723 }, { "auxiliary_loss_clip": 0.01165394, "auxiliary_loss_mlp": 0.01023423, "balance_loss_clip": 1.04916883, "balance_loss_mlp": 1.01656294, "epoch": 0.6827391330487584, "flos": 20810126044800.0, "grad_norm": 2.9023815656510132, "language_loss": 0.80719233, "learning_rate": 9.660167251962484e-07, "loss": 0.82908052, "num_input_tokens_seen": 122127385, "step": 5678, "time_per_iteration": 3.1669881343841553 }, { "auxiliary_loss_clip": 0.01126764, "auxiliary_loss_mlp": 0.01025955, "balance_loss_clip": 1.04494977, "balance_loss_mlp": 1.01855469, "epoch": 0.6828593759393976, "flos": 21688788539520.0, "grad_norm": 1.5755214379908924, "language_loss": 0.77823997, "learning_rate": 9.653500113510654e-07, "loss": 0.79976714, "num_input_tokens_seen": 122146500, "step": 5679, "time_per_iteration": 2.5586845874786377 }, { "auxiliary_loss_clip": 0.01133224, "auxiliary_loss_mlp": 0.01028674, "balance_loss_clip": 1.04401433, "balance_loss_mlp": 1.02114546, "epoch": 0.6829796188300367, "flos": 25337707557120.0, "grad_norm": 2.2016656460012305, "language_loss": 0.67460656, "learning_rate": 9.646834544614627e-07, "loss": 0.69622552, "num_input_tokens_seen": 122167000, "step": 5680, "time_per_iteration": 2.55533766746521 }, { "auxiliary_loss_clip": 0.0113169, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 1.04764128, "balance_loss_mlp": 1.01826, "epoch": 0.6830998617206757, "flos": 20704800389760.0, "grad_norm": 1.8578506392204794, "language_loss": 0.76461303, "learning_rate": 9.64017054628558e-07, "loss": 0.78618217, "num_input_tokens_seen": 122185825, "step": 5681, "time_per_iteration": 2.512789011001587 }, { "auxiliary_loss_clip": 0.01114414, "auxiliary_loss_mlp": 0.01025703, "balance_loss_clip": 1.04334009, "balance_loss_mlp": 1.01868176, "epoch": 0.6832201046113149, "flos": 21726638496000.0, "grad_norm": 1.5941441557914793, "language_loss": 0.78459799, "learning_rate": 9.63350811953441e-07, "loss": 0.8059991, "num_input_tokens_seen": 122206200, "step": 5682, "time_per_iteration": 2.6013052463531494 }, { "auxiliary_loss_clip": 0.01126072, "auxiliary_loss_mlp": 0.01024027, "balance_loss_clip": 1.04457724, "balance_loss_mlp": 1.01708031, "epoch": 0.6833403475019539, "flos": 19536554448000.0, "grad_norm": 2.0652471508641073, "language_loss": 0.7053473, "learning_rate": 9.626847265371826e-07, "loss": 0.72684824, "num_input_tokens_seen": 122225520, "step": 5683, "time_per_iteration": 2.5576019287109375 }, { "auxiliary_loss_clip": 0.01128974, "auxiliary_loss_mlp": 0.01030835, "balance_loss_clip": 1.04240894, "balance_loss_mlp": 1.02292871, "epoch": 0.683460590392593, "flos": 19352153001600.0, "grad_norm": 1.8999893229702642, "language_loss": 0.784688, "learning_rate": 9.620187984808262e-07, "loss": 0.8062861, "num_input_tokens_seen": 122244320, "step": 5684, "time_per_iteration": 2.5262887477874756 }, { "auxiliary_loss_clip": 0.01136191, "auxiliary_loss_mlp": 0.00760703, "balance_loss_clip": 1.04593861, "balance_loss_mlp": 1.00020552, "epoch": 0.6835808332832322, "flos": 23288500650240.0, "grad_norm": 1.9296193042927574, "language_loss": 0.85909057, "learning_rate": 9.613530278853919e-07, "loss": 0.87805951, "num_input_tokens_seen": 122264295, "step": 5685, "time_per_iteration": 2.54351544380188 }, { "auxiliary_loss_clip": 0.01148906, "auxiliary_loss_mlp": 0.01029945, "balance_loss_clip": 1.04742265, "balance_loss_mlp": 1.02292311, "epoch": 0.6837010761738712, "flos": 21653416621440.0, "grad_norm": 1.8982122784811597, "language_loss": 0.74644858, "learning_rate": 9.60687414851879e-07, "loss": 0.76823711, "num_input_tokens_seen": 122285300, "step": 5686, "time_per_iteration": 2.517876625061035 }, { "auxiliary_loss_clip": 0.01136863, "auxiliary_loss_mlp": 0.0102696, "balance_loss_clip": 1.04710639, "balance_loss_mlp": 1.01929474, "epoch": 0.6838213190645103, "flos": 17566387418880.0, "grad_norm": 2.136565312108994, "language_loss": 0.76912403, "learning_rate": 9.600219594812575e-07, "loss": 0.79076219, "num_input_tokens_seen": 122303240, "step": 5687, "time_per_iteration": 2.4824867248535156 }, { "auxiliary_loss_clip": 0.01163313, "auxiliary_loss_mlp": 0.01024775, "balance_loss_clip": 1.04771364, "balance_loss_mlp": 1.01788485, "epoch": 0.6839415619551494, "flos": 23112538899840.0, "grad_norm": 1.6122664594782505, "language_loss": 0.72600925, "learning_rate": 9.593566618744786e-07, "loss": 0.74789017, "num_input_tokens_seen": 122323390, "step": 5688, "time_per_iteration": 2.471177339553833 }, { "auxiliary_loss_clip": 0.01164777, "auxiliary_loss_mlp": 0.01024016, "balance_loss_clip": 1.04760373, "balance_loss_mlp": 1.01714611, "epoch": 0.6840618048457885, "flos": 22127868391680.0, "grad_norm": 1.5908140554140784, "language_loss": 0.74031466, "learning_rate": 9.58691522132466e-07, "loss": 0.76220262, "num_input_tokens_seen": 122342200, "step": 5689, "time_per_iteration": 2.4723119735717773 }, { "auxiliary_loss_clip": 0.01144566, "auxiliary_loss_mlp": 0.01028136, "balance_loss_clip": 1.04910791, "balance_loss_mlp": 1.02106667, "epoch": 0.6841820477364275, "flos": 22015898720640.0, "grad_norm": 1.96896194936858, "language_loss": 0.84506124, "learning_rate": 9.58026540356123e-07, "loss": 0.86678821, "num_input_tokens_seen": 122360465, "step": 5690, "time_per_iteration": 2.5415971279144287 }, { "auxiliary_loss_clip": 0.01151498, "auxiliary_loss_mlp": 0.01025976, "balance_loss_clip": 1.04520035, "balance_loss_mlp": 1.01837027, "epoch": 0.6843022906270667, "flos": 24900531125760.0, "grad_norm": 1.6137864958644448, "language_loss": 0.86558896, "learning_rate": 9.573617166463246e-07, "loss": 0.88736373, "num_input_tokens_seen": 122381680, "step": 5691, "time_per_iteration": 3.3195462226867676 }, { "auxiliary_loss_clip": 0.01138605, "auxiliary_loss_mlp": 0.01024419, "balance_loss_clip": 1.04456234, "balance_loss_mlp": 1.01693821, "epoch": 0.6844225335177058, "flos": 19969924037760.0, "grad_norm": 1.9053040072080132, "language_loss": 0.60345274, "learning_rate": 9.56697051103924e-07, "loss": 0.62508297, "num_input_tokens_seen": 122399120, "step": 5692, "time_per_iteration": 2.5237269401550293 }, { "auxiliary_loss_clip": 0.01135848, "auxiliary_loss_mlp": 0.01025861, "balance_loss_clip": 1.04533195, "balance_loss_mlp": 1.01875651, "epoch": 0.6845427764083448, "flos": 25883334126720.0, "grad_norm": 1.7779145595680794, "language_loss": 0.80785573, "learning_rate": 9.560325438297522e-07, "loss": 0.82947284, "num_input_tokens_seen": 122417430, "step": 5693, "time_per_iteration": 2.5646586418151855 }, { "auxiliary_loss_clip": 0.01138211, "auxiliary_loss_mlp": 0.01028227, "balance_loss_clip": 1.04869306, "balance_loss_mlp": 1.02169704, "epoch": 0.684663019298984, "flos": 18880143356160.0, "grad_norm": 1.9005157970986026, "language_loss": 0.86649841, "learning_rate": 9.553681949246127e-07, "loss": 0.88816285, "num_input_tokens_seen": 122435055, "step": 5694, "time_per_iteration": 2.502432346343994 }, { "auxiliary_loss_clip": 0.01133004, "auxiliary_loss_mlp": 0.01030335, "balance_loss_clip": 1.0486325, "balance_loss_mlp": 1.02204394, "epoch": 0.684783262189623, "flos": 54193725302400.0, "grad_norm": 2.150539961280656, "language_loss": 0.75007784, "learning_rate": 9.547040044892886e-07, "loss": 0.77171123, "num_input_tokens_seen": 122462570, "step": 5695, "time_per_iteration": 2.8399417400360107 }, { "auxiliary_loss_clip": 0.01050097, "auxiliary_loss_mlp": 0.01001401, "balance_loss_clip": 1.01063955, "balance_loss_mlp": 1.00034034, "epoch": 0.6849035050802621, "flos": 63970264143360.0, "grad_norm": 0.864901599711069, "language_loss": 0.60177034, "learning_rate": 9.540399726245354e-07, "loss": 0.62228531, "num_input_tokens_seen": 122519275, "step": 5696, "time_per_iteration": 2.968353509902954 }, { "auxiliary_loss_clip": 0.0113527, "auxiliary_loss_mlp": 0.01029819, "balance_loss_clip": 1.04552543, "balance_loss_mlp": 1.0219872, "epoch": 0.6850237479709013, "flos": 25224121774080.0, "grad_norm": 1.9013705695185932, "language_loss": 0.68832725, "learning_rate": 9.533760994310859e-07, "loss": 0.70997816, "num_input_tokens_seen": 122539675, "step": 5697, "time_per_iteration": 2.5716166496276855 }, { "auxiliary_loss_clip": 0.01168351, "auxiliary_loss_mlp": 0.01025756, "balance_loss_clip": 1.05062366, "balance_loss_mlp": 1.01847577, "epoch": 0.6851439908615403, "flos": 19354128249600.0, "grad_norm": 1.98070628751398, "language_loss": 0.7477383, "learning_rate": 9.527123850096508e-07, "loss": 0.76967937, "num_input_tokens_seen": 122558035, "step": 5698, "time_per_iteration": 2.4416208267211914 }, { "auxiliary_loss_clip": 0.01153577, "auxiliary_loss_mlp": 0.01028327, "balance_loss_clip": 1.04820704, "balance_loss_mlp": 1.02096891, "epoch": 0.6852642337521794, "flos": 23182133500800.0, "grad_norm": 1.7856231498541275, "language_loss": 0.71774113, "learning_rate": 9.520488294609142e-07, "loss": 0.73956019, "num_input_tokens_seen": 122576815, "step": 5699, "time_per_iteration": 2.5128629207611084 }, { "auxiliary_loss_clip": 0.01016131, "auxiliary_loss_mlp": 0.01002788, "balance_loss_clip": 1.01026869, "balance_loss_mlp": 1.0015955, "epoch": 0.6853844766428185, "flos": 62647206583680.0, "grad_norm": 0.7401045916684383, "language_loss": 0.53870606, "learning_rate": 9.513854328855368e-07, "loss": 0.55889529, "num_input_tokens_seen": 122634690, "step": 5700, "time_per_iteration": 3.1216955184936523 }, { "auxiliary_loss_clip": 0.0116241, "auxiliary_loss_mlp": 0.01021951, "balance_loss_clip": 1.04722857, "balance_loss_mlp": 1.01460147, "epoch": 0.6855047195334576, "flos": 23437242869760.0, "grad_norm": 1.8902997397583672, "language_loss": 0.8096053, "learning_rate": 9.507221953841558e-07, "loss": 0.83144891, "num_input_tokens_seen": 122652320, "step": 5701, "time_per_iteration": 3.239274263381958 }, { "auxiliary_loss_clip": 0.01155713, "auxiliary_loss_mlp": 0.01028233, "balance_loss_clip": 1.05078673, "balance_loss_mlp": 1.02047896, "epoch": 0.6856249624240967, "flos": 20664831530880.0, "grad_norm": 1.6413448193191258, "language_loss": 0.78001034, "learning_rate": 9.500591170573824e-07, "loss": 0.80184972, "num_input_tokens_seen": 122672340, "step": 5702, "time_per_iteration": 2.5046093463897705 }, { "auxiliary_loss_clip": 0.01110552, "auxiliary_loss_mlp": 0.01028393, "balance_loss_clip": 1.04462862, "balance_loss_mlp": 1.02124023, "epoch": 0.6857452053147358, "flos": 17087302794240.0, "grad_norm": 1.8557634551056017, "language_loss": 0.74102861, "learning_rate": 9.493961980058078e-07, "loss": 0.76241809, "num_input_tokens_seen": 122689935, "step": 5703, "time_per_iteration": 2.5681872367858887 }, { "auxiliary_loss_clip": 0.01081553, "auxiliary_loss_mlp": 0.01024726, "balance_loss_clip": 1.03793788, "balance_loss_mlp": 1.01807141, "epoch": 0.6858654482053749, "flos": 30847266057600.0, "grad_norm": 2.1610589259119592, "language_loss": 0.6733427, "learning_rate": 9.48733438329993e-07, "loss": 0.69440544, "num_input_tokens_seen": 122710200, "step": 5704, "time_per_iteration": 4.335917234420776 }, { "auxiliary_loss_clip": 0.01165531, "auxiliary_loss_mlp": 0.00760694, "balance_loss_clip": 1.05103517, "balance_loss_mlp": 1.00019825, "epoch": 0.6859856910960139, "flos": 28877314510080.0, "grad_norm": 1.6392451437394708, "language_loss": 0.74553013, "learning_rate": 9.480708381304807e-07, "loss": 0.76479232, "num_input_tokens_seen": 122731495, "step": 5705, "time_per_iteration": 2.540844678878784 }, { "auxiliary_loss_clip": 0.01108765, "auxiliary_loss_mlp": 0.01028648, "balance_loss_clip": 1.04675865, "balance_loss_mlp": 1.02131701, "epoch": 0.6861059339866531, "flos": 19354523299200.0, "grad_norm": 3.1038666377538555, "language_loss": 0.83632052, "learning_rate": 9.474083975077858e-07, "loss": 0.85769463, "num_input_tokens_seen": 122748620, "step": 5706, "time_per_iteration": 2.568183183670044 }, { "auxiliary_loss_clip": 0.01143979, "auxiliary_loss_mlp": 0.01021611, "balance_loss_clip": 1.04503477, "balance_loss_mlp": 1.01429141, "epoch": 0.6862261768772921, "flos": 22199976944640.0, "grad_norm": 6.695634033673204, "language_loss": 0.79853654, "learning_rate": 9.467461165623994e-07, "loss": 0.82019246, "num_input_tokens_seen": 122767670, "step": 5707, "time_per_iteration": 2.501483917236328 }, { "auxiliary_loss_clip": 0.01152054, "auxiliary_loss_mlp": 0.01026147, "balance_loss_clip": 1.04480195, "balance_loss_mlp": 1.01923549, "epoch": 0.6863464197679312, "flos": 26285677344000.0, "grad_norm": 1.8173769854706732, "language_loss": 0.79838681, "learning_rate": 9.46083995394791e-07, "loss": 0.82016879, "num_input_tokens_seen": 122785480, "step": 5708, "time_per_iteration": 2.519155263900757 }, { "auxiliary_loss_clip": 0.01152219, "auxiliary_loss_mlp": 0.00760361, "balance_loss_clip": 1.04709446, "balance_loss_mlp": 1.00021708, "epoch": 0.6864666626585703, "flos": 37815228564480.0, "grad_norm": 2.6910946360441725, "language_loss": 0.63014579, "learning_rate": 9.454220341054012e-07, "loss": 0.64927161, "num_input_tokens_seen": 122810265, "step": 5709, "time_per_iteration": 2.6425440311431885 }, { "auxiliary_loss_clip": 0.01125337, "auxiliary_loss_mlp": 0.01025609, "balance_loss_clip": 1.04563737, "balance_loss_mlp": 1.01806879, "epoch": 0.6865869055492094, "flos": 19391152193280.0, "grad_norm": 1.8980088562320436, "language_loss": 0.80772674, "learning_rate": 9.447602327946512e-07, "loss": 0.82923621, "num_input_tokens_seen": 122828905, "step": 5710, "time_per_iteration": 2.5407516956329346 }, { "auxiliary_loss_clip": 0.01135689, "auxiliary_loss_mlp": 0.0102988, "balance_loss_clip": 1.04427576, "balance_loss_mlp": 1.0226531, "epoch": 0.6867071484398485, "flos": 20375966355840.0, "grad_norm": 1.8879728530309865, "language_loss": 0.7636835, "learning_rate": 9.440985915629338e-07, "loss": 0.78533918, "num_input_tokens_seen": 122846235, "step": 5711, "time_per_iteration": 2.5184476375579834 }, { "auxiliary_loss_clip": 0.0116805, "auxiliary_loss_mlp": 0.01027186, "balance_loss_clip": 1.05247533, "balance_loss_mlp": 1.01987004, "epoch": 0.6868273913304875, "flos": 15889143801600.0, "grad_norm": 1.8122391798254844, "language_loss": 0.73123628, "learning_rate": 9.434371105106223e-07, "loss": 0.75318861, "num_input_tokens_seen": 122863835, "step": 5712, "time_per_iteration": 2.441215991973877 }, { "auxiliary_loss_clip": 0.0112163, "auxiliary_loss_mlp": 0.01024091, "balance_loss_clip": 1.04381871, "balance_loss_mlp": 1.0173502, "epoch": 0.6869476342211267, "flos": 24462492768000.0, "grad_norm": 1.81442563635407, "language_loss": 0.70659763, "learning_rate": 9.427757897380602e-07, "loss": 0.72805488, "num_input_tokens_seen": 122883235, "step": 5713, "time_per_iteration": 2.591341733932495 }, { "auxiliary_loss_clip": 0.01120486, "auxiliary_loss_mlp": 0.01024596, "balance_loss_clip": 1.04435217, "balance_loss_mlp": 1.01700854, "epoch": 0.6870678771117658, "flos": 18442571875200.0, "grad_norm": 2.326256140573583, "language_loss": 0.84764057, "learning_rate": 9.421146293455695e-07, "loss": 0.86909139, "num_input_tokens_seen": 122898975, "step": 5714, "time_per_iteration": 2.5438804626464844 }, { "auxiliary_loss_clip": 0.01136844, "auxiliary_loss_mlp": 0.01033631, "balance_loss_clip": 1.04657006, "balance_loss_mlp": 1.02611518, "epoch": 0.6871881200024048, "flos": 22200371994240.0, "grad_norm": 2.28277672256521, "language_loss": 0.68300164, "learning_rate": 9.414536294334489e-07, "loss": 0.70470643, "num_input_tokens_seen": 122918995, "step": 5715, "time_per_iteration": 2.5342860221862793 }, { "auxiliary_loss_clip": 0.01139129, "auxiliary_loss_mlp": 0.01025616, "balance_loss_clip": 1.04364014, "balance_loss_mlp": 1.01885438, "epoch": 0.687308362893044, "flos": 22127724737280.0, "grad_norm": 1.7969527618959578, "language_loss": 0.69725823, "learning_rate": 9.407927901019708e-07, "loss": 0.71890569, "num_input_tokens_seen": 122938125, "step": 5716, "time_per_iteration": 2.5146777629852295 }, { "auxiliary_loss_clip": 0.01152605, "auxiliary_loss_mlp": 0.01024543, "balance_loss_clip": 1.04751706, "balance_loss_mlp": 1.01755738, "epoch": 0.687428605783683, "flos": 25040546340480.0, "grad_norm": 1.973169604322113, "language_loss": 0.76846039, "learning_rate": 9.401321114513854e-07, "loss": 0.79023182, "num_input_tokens_seen": 122957020, "step": 5717, "time_per_iteration": 3.2731316089630127 }, { "auxiliary_loss_clip": 0.01166904, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.05015016, "balance_loss_mlp": 1.02019501, "epoch": 0.6875488486743221, "flos": 23770063313280.0, "grad_norm": 1.730916092835809, "language_loss": 0.75332522, "learning_rate": 9.394715935819155e-07, "loss": 0.7752704, "num_input_tokens_seen": 122977410, "step": 5718, "time_per_iteration": 2.480463743209839 }, { "auxiliary_loss_clip": 0.01156586, "auxiliary_loss_mlp": 0.01024006, "balance_loss_clip": 1.04826593, "balance_loss_mlp": 1.01699066, "epoch": 0.6876690915649613, "flos": 25516937445120.0, "grad_norm": 1.8256219986346391, "language_loss": 0.62707502, "learning_rate": 9.388112365937608e-07, "loss": 0.64888096, "num_input_tokens_seen": 122996875, "step": 5719, "time_per_iteration": 2.530275821685791 }, { "auxiliary_loss_clip": 0.01124894, "auxiliary_loss_mlp": 0.01042784, "balance_loss_clip": 1.04478407, "balance_loss_mlp": 1.03511298, "epoch": 0.6877893344556003, "flos": 19427996568960.0, "grad_norm": 2.229840447013276, "language_loss": 0.82707143, "learning_rate": 9.381510405870985e-07, "loss": 0.84874821, "num_input_tokens_seen": 123015890, "step": 5720, "time_per_iteration": 2.535473346710205 }, { "auxiliary_loss_clip": 0.01153736, "auxiliary_loss_mlp": 0.01026952, "balance_loss_clip": 1.04994202, "balance_loss_mlp": 1.0191381, "epoch": 0.6879095773462394, "flos": 18661303745280.0, "grad_norm": 3.4194654884094096, "language_loss": 0.7736553, "learning_rate": 9.374910056620791e-07, "loss": 0.79546225, "num_input_tokens_seen": 123034955, "step": 5721, "time_per_iteration": 2.477207660675049 }, { "auxiliary_loss_clip": 0.01155783, "auxiliary_loss_mlp": 0.0103076, "balance_loss_clip": 1.05028844, "balance_loss_mlp": 1.02347028, "epoch": 0.6880298202368785, "flos": 20883132437760.0, "grad_norm": 2.344875550638085, "language_loss": 0.81039751, "learning_rate": 9.368311319188293e-07, "loss": 0.83226287, "num_input_tokens_seen": 123052770, "step": 5722, "time_per_iteration": 2.486459493637085 }, { "auxiliary_loss_clip": 0.01123253, "auxiliary_loss_mlp": 0.01027025, "balance_loss_clip": 1.04438496, "balance_loss_mlp": 1.02020025, "epoch": 0.6881500631275176, "flos": 30153292318080.0, "grad_norm": 1.693252059409962, "language_loss": 0.7936244, "learning_rate": 9.361714194574515e-07, "loss": 0.81512719, "num_input_tokens_seen": 123075105, "step": 5723, "time_per_iteration": 2.624188184738159 }, { "auxiliary_loss_clip": 0.01057697, "auxiliary_loss_mlp": 0.01000958, "balance_loss_clip": 1.00993156, "balance_loss_mlp": 0.9998911, "epoch": 0.6882703060181566, "flos": 66181537215360.0, "grad_norm": 0.7629704581325396, "language_loss": 0.58348477, "learning_rate": 9.355118683780228e-07, "loss": 0.60407138, "num_input_tokens_seen": 123145175, "step": 5724, "time_per_iteration": 3.16556978225708 }, { "auxiliary_loss_clip": 0.0116783, "auxiliary_loss_mlp": 0.01031387, "balance_loss_clip": 1.04947019, "balance_loss_mlp": 1.02406788, "epoch": 0.6883905489087958, "flos": 18214646123520.0, "grad_norm": 2.8283392893316663, "language_loss": 0.79192966, "learning_rate": 9.348524787805987e-07, "loss": 0.81392181, "num_input_tokens_seen": 123160365, "step": 5725, "time_per_iteration": 2.425603151321411 }, { "auxiliary_loss_clip": 0.01125693, "auxiliary_loss_mlp": 0.01027318, "balance_loss_clip": 1.04237247, "balance_loss_mlp": 1.01985002, "epoch": 0.6885107917994349, "flos": 14056262553600.0, "grad_norm": 2.7898642324005967, "language_loss": 0.84876698, "learning_rate": 9.341932507652053e-07, "loss": 0.87029707, "num_input_tokens_seen": 123174855, "step": 5726, "time_per_iteration": 2.5247151851654053 }, { "auxiliary_loss_clip": 0.01134689, "auxiliary_loss_mlp": 0.01026672, "balance_loss_clip": 1.04238462, "balance_loss_mlp": 1.01879251, "epoch": 0.6886310346900739, "flos": 28690722334080.0, "grad_norm": 1.6825044361732022, "language_loss": 0.78562248, "learning_rate": 9.335341844318489e-07, "loss": 0.80723602, "num_input_tokens_seen": 123194995, "step": 5727, "time_per_iteration": 3.38222599029541 }, { "auxiliary_loss_clip": 0.01136486, "auxiliary_loss_mlp": 0.01027779, "balance_loss_clip": 1.04652894, "balance_loss_mlp": 1.02047145, "epoch": 0.6887512775807131, "flos": 24535319592960.0, "grad_norm": 1.7214054067080398, "language_loss": 0.73218048, "learning_rate": 9.328752798805091e-07, "loss": 0.7538231, "num_input_tokens_seen": 123213465, "step": 5728, "time_per_iteration": 2.5493662357330322 }, { "auxiliary_loss_clip": 0.01153602, "auxiliary_loss_mlp": 0.01029835, "balance_loss_clip": 1.0493803, "balance_loss_mlp": 1.02239609, "epoch": 0.6888715204713521, "flos": 22414363269120.0, "grad_norm": 2.1542192628929158, "language_loss": 0.75673175, "learning_rate": 9.322165372111399e-07, "loss": 0.77856612, "num_input_tokens_seen": 123231610, "step": 5729, "time_per_iteration": 3.3181333541870117 }, { "auxiliary_loss_clip": 0.01121442, "auxiliary_loss_mlp": 0.01028151, "balance_loss_clip": 1.04601169, "balance_loss_mlp": 1.02110255, "epoch": 0.6889917633619912, "flos": 22054323294720.0, "grad_norm": 1.9759455018304592, "language_loss": 0.75579858, "learning_rate": 9.315579565236747e-07, "loss": 0.77729446, "num_input_tokens_seen": 123250715, "step": 5730, "time_per_iteration": 3.3290390968322754 }, { "auxiliary_loss_clip": 0.01136338, "auxiliary_loss_mlp": 0.01030563, "balance_loss_clip": 1.04792428, "balance_loss_mlp": 1.02363372, "epoch": 0.6891120062526304, "flos": 23949724164480.0, "grad_norm": 2.1152947795277655, "language_loss": 0.74255574, "learning_rate": 9.308995379180162e-07, "loss": 0.76422471, "num_input_tokens_seen": 123270270, "step": 5731, "time_per_iteration": 2.5412368774414062 }, { "auxiliary_loss_clip": 0.01048876, "auxiliary_loss_mlp": 0.01000581, "balance_loss_clip": 1.0096215, "balance_loss_mlp": 0.99944896, "epoch": 0.6892322491432694, "flos": 64117354337280.0, "grad_norm": 0.7401446330248045, "language_loss": 0.5953989, "learning_rate": 9.302412814940488e-07, "loss": 0.61589348, "num_input_tokens_seen": 123333045, "step": 5732, "time_per_iteration": 3.1396541595458984 }, { "auxiliary_loss_clip": 0.01137474, "auxiliary_loss_mlp": 0.01024939, "balance_loss_clip": 1.0446943, "balance_loss_mlp": 1.01736915, "epoch": 0.6893524920339085, "flos": 23002436736000.0, "grad_norm": 2.1347819835232844, "language_loss": 0.71340138, "learning_rate": 9.295831873516276e-07, "loss": 0.73502553, "num_input_tokens_seen": 123352320, "step": 5733, "time_per_iteration": 2.541996955871582 }, { "auxiliary_loss_clip": 0.01166325, "auxiliary_loss_mlp": 0.01024185, "balance_loss_clip": 1.05103803, "balance_loss_mlp": 1.01703799, "epoch": 0.6894727349245476, "flos": 21396260177280.0, "grad_norm": 2.0102870533987036, "language_loss": 0.76134324, "learning_rate": 9.289252555905873e-07, "loss": 0.78324831, "num_input_tokens_seen": 123372400, "step": 5734, "time_per_iteration": 2.4620800018310547 }, { "auxiliary_loss_clip": 0.01155252, "auxiliary_loss_mlp": 0.01031587, "balance_loss_clip": 1.05111241, "balance_loss_mlp": 1.02428555, "epoch": 0.6895929778151867, "flos": 19865316654720.0, "grad_norm": 1.8569828080358672, "language_loss": 0.75686371, "learning_rate": 9.282674863107334e-07, "loss": 0.77873206, "num_input_tokens_seen": 123390215, "step": 5735, "time_per_iteration": 2.49055814743042 }, { "auxiliary_loss_clip": 0.01148919, "auxiliary_loss_mlp": 0.01026601, "balance_loss_clip": 1.04844213, "balance_loss_mlp": 1.01951408, "epoch": 0.6897132207058257, "flos": 18179166464640.0, "grad_norm": 2.477312700318787, "language_loss": 0.7578975, "learning_rate": 9.276098796118488e-07, "loss": 0.77965266, "num_input_tokens_seen": 123406700, "step": 5736, "time_per_iteration": 2.45066499710083 }, { "auxiliary_loss_clip": 0.01137122, "auxiliary_loss_mlp": 0.01022163, "balance_loss_clip": 1.04663515, "balance_loss_mlp": 1.01506722, "epoch": 0.6898334635964649, "flos": 32561641359360.0, "grad_norm": 1.732943225997359, "language_loss": 0.66242653, "learning_rate": 9.269524355936938e-07, "loss": 0.68401945, "num_input_tokens_seen": 123429880, "step": 5737, "time_per_iteration": 2.6173627376556396 }, { "auxiliary_loss_clip": 0.01131646, "auxiliary_loss_mlp": 0.01024587, "balance_loss_clip": 1.04287481, "balance_loss_mlp": 1.01760995, "epoch": 0.689953706487104, "flos": 22819004956800.0, "grad_norm": 1.7024815078908677, "language_loss": 0.85150766, "learning_rate": 9.262951543560002e-07, "loss": 0.87307, "num_input_tokens_seen": 123449105, "step": 5738, "time_per_iteration": 2.528764247894287 }, { "auxiliary_loss_clip": 0.01139218, "auxiliary_loss_mlp": 0.010349, "balance_loss_clip": 1.04997778, "balance_loss_mlp": 1.02735412, "epoch": 0.690073949377743, "flos": 18515362786560.0, "grad_norm": 2.2228202177294683, "language_loss": 0.85902929, "learning_rate": 9.256380359984795e-07, "loss": 0.88077044, "num_input_tokens_seen": 123466215, "step": 5739, "time_per_iteration": 2.4776101112365723 }, { "auxiliary_loss_clip": 0.01117374, "auxiliary_loss_mlp": 0.01024675, "balance_loss_clip": 1.04089355, "balance_loss_mlp": 1.01736116, "epoch": 0.6901941922683821, "flos": 34857194716800.0, "grad_norm": 1.77334945872231, "language_loss": 0.74368823, "learning_rate": 9.249810806208139e-07, "loss": 0.7651087, "num_input_tokens_seen": 123485480, "step": 5740, "time_per_iteration": 2.7054927349090576 }, { "auxiliary_loss_clip": 0.01107066, "auxiliary_loss_mlp": 0.00760415, "balance_loss_clip": 1.03845954, "balance_loss_mlp": 1.00023735, "epoch": 0.6903144351590212, "flos": 16253672976000.0, "grad_norm": 1.926815259565751, "language_loss": 0.80127066, "learning_rate": 9.243242883226627e-07, "loss": 0.81994551, "num_input_tokens_seen": 123504575, "step": 5741, "time_per_iteration": 2.5816588401794434 }, { "auxiliary_loss_clip": 0.0115456, "auxiliary_loss_mlp": 0.01026079, "balance_loss_clip": 1.04450166, "balance_loss_mlp": 1.01834249, "epoch": 0.6904346780496603, "flos": 28035137255040.0, "grad_norm": 1.756565120371662, "language_loss": 0.69491482, "learning_rate": 9.236676592036628e-07, "loss": 0.71672118, "num_input_tokens_seen": 123524250, "step": 5742, "time_per_iteration": 2.5537710189819336 }, { "auxiliary_loss_clip": 0.01137743, "auxiliary_loss_mlp": 0.01024637, "balance_loss_clip": 1.0504117, "balance_loss_mlp": 1.01746619, "epoch": 0.6905549209402994, "flos": 23624266008960.0, "grad_norm": 1.7368069349114752, "language_loss": 0.73636407, "learning_rate": 9.230111933634228e-07, "loss": 0.75798786, "num_input_tokens_seen": 123545845, "step": 5743, "time_per_iteration": 3.376593828201294 }, { "auxiliary_loss_clip": 0.01157333, "auxiliary_loss_mlp": 0.01031381, "balance_loss_clip": 1.05115724, "balance_loss_mlp": 1.02401042, "epoch": 0.6906751638309385, "flos": 23114945111040.0, "grad_norm": 1.5491217838116822, "language_loss": 0.80557334, "learning_rate": 9.223548909015288e-07, "loss": 0.82746041, "num_input_tokens_seen": 123567535, "step": 5744, "time_per_iteration": 2.5351738929748535 }, { "auxiliary_loss_clip": 0.01104795, "auxiliary_loss_mlp": 0.0103055, "balance_loss_clip": 1.04376888, "balance_loss_mlp": 1.02379966, "epoch": 0.6907954067215776, "flos": 27305468375040.0, "grad_norm": 1.8054295318970235, "language_loss": 0.7180391, "learning_rate": 9.216987519175407e-07, "loss": 0.73939252, "num_input_tokens_seen": 123587710, "step": 5745, "time_per_iteration": 2.626535177230835 }, { "auxiliary_loss_clip": 0.01146995, "auxiliary_loss_mlp": 0.01024023, "balance_loss_clip": 1.04758334, "balance_loss_mlp": 1.01664126, "epoch": 0.6909156496122166, "flos": 21689399070720.0, "grad_norm": 1.5879151496259174, "language_loss": 0.68259799, "learning_rate": 9.210427765109942e-07, "loss": 0.70430815, "num_input_tokens_seen": 123607385, "step": 5746, "time_per_iteration": 2.490229606628418 }, { "auxiliary_loss_clip": 0.01138253, "auxiliary_loss_mlp": 0.01032623, "balance_loss_clip": 1.04463732, "balance_loss_mlp": 1.0250001, "epoch": 0.6910358925028558, "flos": 22561453463040.0, "grad_norm": 3.441334886836369, "language_loss": 0.81636655, "learning_rate": 9.20386964781402e-07, "loss": 0.83807528, "num_input_tokens_seen": 123625405, "step": 5747, "time_per_iteration": 2.528012752532959 }, { "auxiliary_loss_clip": 0.01136347, "auxiliary_loss_mlp": 0.01029206, "balance_loss_clip": 1.04535162, "balance_loss_mlp": 1.02157021, "epoch": 0.6911561353934949, "flos": 22054107813120.0, "grad_norm": 2.5925726725448857, "language_loss": 0.84636551, "learning_rate": 9.197313168282472e-07, "loss": 0.86802107, "num_input_tokens_seen": 123642850, "step": 5748, "time_per_iteration": 2.554379940032959 }, { "auxiliary_loss_clip": 0.01146882, "auxiliary_loss_mlp": 0.01029538, "balance_loss_clip": 1.04441667, "balance_loss_mlp": 1.02206016, "epoch": 0.6912763782841339, "flos": 24206557386240.0, "grad_norm": 3.3948231664646764, "language_loss": 0.71999443, "learning_rate": 9.190758327509935e-07, "loss": 0.74175858, "num_input_tokens_seen": 123661595, "step": 5749, "time_per_iteration": 2.510038137435913 }, { "auxiliary_loss_clip": 0.01019319, "auxiliary_loss_mlp": 0.00751307, "balance_loss_clip": 1.01276302, "balance_loss_mlp": 1.0001061, "epoch": 0.6913966211747731, "flos": 52329641091840.0, "grad_norm": 0.9268056157586627, "language_loss": 0.64469326, "learning_rate": 9.184205126490767e-07, "loss": 0.66239941, "num_input_tokens_seen": 123710490, "step": 5750, "time_per_iteration": 2.9966542720794678 }, { "auxiliary_loss_clip": 0.0103025, "auxiliary_loss_mlp": 0.00751179, "balance_loss_clip": 1.01048756, "balance_loss_mlp": 1.00002694, "epoch": 0.6915168640654121, "flos": 66741274851840.0, "grad_norm": 1.112088164373328, "language_loss": 0.5967145, "learning_rate": 9.177653566219075e-07, "loss": 0.61452878, "num_input_tokens_seen": 123765215, "step": 5751, "time_per_iteration": 3.0298500061035156 }, { "auxiliary_loss_clip": 0.01127851, "auxiliary_loss_mlp": 0.01027952, "balance_loss_clip": 1.04264474, "balance_loss_mlp": 1.02055252, "epoch": 0.6916371069560512, "flos": 18296523175680.0, "grad_norm": 2.086506029357863, "language_loss": 0.75620037, "learning_rate": 9.171103647688744e-07, "loss": 0.77775842, "num_input_tokens_seen": 123783955, "step": 5752, "time_per_iteration": 2.5580313205718994 }, { "auxiliary_loss_clip": 0.01073691, "auxiliary_loss_mlp": 0.01026766, "balance_loss_clip": 1.03857088, "balance_loss_mlp": 1.01995611, "epoch": 0.6917573498466904, "flos": 19645794685440.0, "grad_norm": 3.3424390289817736, "language_loss": 0.68924171, "learning_rate": 9.164555371893367e-07, "loss": 0.71024632, "num_input_tokens_seen": 123803885, "step": 5753, "time_per_iteration": 3.4237396717071533 }, { "auxiliary_loss_clip": 0.01151898, "auxiliary_loss_mlp": 0.00760017, "balance_loss_clip": 1.04760933, "balance_loss_mlp": 1.00022769, "epoch": 0.6918775927373294, "flos": 14210319985920.0, "grad_norm": 1.869484816953531, "language_loss": 0.74994606, "learning_rate": 9.158008739826333e-07, "loss": 0.7690652, "num_input_tokens_seen": 123821485, "step": 5754, "time_per_iteration": 2.4832136631011963 }, { "auxiliary_loss_clip": 0.01136767, "auxiliary_loss_mlp": 0.01034513, "balance_loss_clip": 1.04690266, "balance_loss_mlp": 1.02720213, "epoch": 0.6919978356279685, "flos": 23985455218560.0, "grad_norm": 1.548373761498636, "language_loss": 0.86625338, "learning_rate": 9.151463752480744e-07, "loss": 0.88796622, "num_input_tokens_seen": 123840215, "step": 5755, "time_per_iteration": 3.395021438598633 }, { "auxiliary_loss_clip": 0.0111494, "auxiliary_loss_mlp": 0.01027691, "balance_loss_clip": 1.04363871, "balance_loss_mlp": 1.02061582, "epoch": 0.6921180785186076, "flos": 23622937205760.0, "grad_norm": 1.3771447696298464, "language_loss": 0.80349791, "learning_rate": 9.144920410849493e-07, "loss": 0.82492423, "num_input_tokens_seen": 123861450, "step": 5756, "time_per_iteration": 3.3393185138702393 }, { "auxiliary_loss_clip": 0.01144161, "auxiliary_loss_mlp": 0.01024398, "balance_loss_clip": 1.0472964, "balance_loss_mlp": 1.01710534, "epoch": 0.6922383214092467, "flos": 21142623265920.0, "grad_norm": 1.913948526839435, "language_loss": 0.80599356, "learning_rate": 9.138378715925176e-07, "loss": 0.82767922, "num_input_tokens_seen": 123880545, "step": 5757, "time_per_iteration": 2.530811309814453 }, { "auxiliary_loss_clip": 0.01132139, "auxiliary_loss_mlp": 0.01027845, "balance_loss_clip": 1.04373765, "balance_loss_mlp": 1.02029324, "epoch": 0.6923585642998857, "flos": 21470667200640.0, "grad_norm": 1.6404207756645195, "language_loss": 0.8086257, "learning_rate": 9.131838668700167e-07, "loss": 0.83022559, "num_input_tokens_seen": 123900615, "step": 5758, "time_per_iteration": 2.534411907196045 }, { "auxiliary_loss_clip": 0.01121952, "auxiliary_loss_mlp": 0.0102529, "balance_loss_clip": 1.04271126, "balance_loss_mlp": 1.0177319, "epoch": 0.6924788071905249, "flos": 21105204272640.0, "grad_norm": 1.716142850470053, "language_loss": 0.86380702, "learning_rate": 9.125300270166598e-07, "loss": 0.88527942, "num_input_tokens_seen": 123921220, "step": 5759, "time_per_iteration": 2.59421968460083 }, { "auxiliary_loss_clip": 0.01128518, "auxiliary_loss_mlp": 0.01024648, "balance_loss_clip": 1.04391289, "balance_loss_mlp": 1.01754594, "epoch": 0.692599050081164, "flos": 26250018117120.0, "grad_norm": 1.6981209311535594, "language_loss": 0.8576951, "learning_rate": 9.118763521316324e-07, "loss": 0.87922674, "num_input_tokens_seen": 123941795, "step": 5760, "time_per_iteration": 2.6021220684051514 }, { "auxiliary_loss_clip": 0.01166478, "auxiliary_loss_mlp": 0.00760814, "balance_loss_clip": 1.0481956, "balance_loss_mlp": 1.00020564, "epoch": 0.692719292971803, "flos": 20885215426560.0, "grad_norm": 1.5422521662049735, "language_loss": 0.76197231, "learning_rate": 9.112228423140987e-07, "loss": 0.78124523, "num_input_tokens_seen": 123960715, "step": 5761, "time_per_iteration": 2.4581542015075684 }, { "auxiliary_loss_clip": 0.01142284, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 1.04612255, "balance_loss_mlp": 1.02168441, "epoch": 0.6928395358624422, "flos": 25921938268800.0, "grad_norm": 2.2946724632605657, "language_loss": 0.86477536, "learning_rate": 9.105694976631932e-07, "loss": 0.88649189, "num_input_tokens_seen": 123978625, "step": 5762, "time_per_iteration": 2.544137716293335 }, { "auxiliary_loss_clip": 0.01155256, "auxiliary_loss_mlp": 0.01022126, "balance_loss_clip": 1.05047154, "balance_loss_mlp": 1.01549816, "epoch": 0.6929597787530812, "flos": 23586559706880.0, "grad_norm": 2.0024764046055523, "language_loss": 0.72764099, "learning_rate": 9.099163182780283e-07, "loss": 0.7494148, "num_input_tokens_seen": 123996780, "step": 5763, "time_per_iteration": 2.513568639755249 }, { "auxiliary_loss_clip": 0.01135002, "auxiliary_loss_mlp": 0.01031447, "balance_loss_clip": 1.04698849, "balance_loss_mlp": 1.02450323, "epoch": 0.6930800216437203, "flos": 18255656476800.0, "grad_norm": 2.979453153673986, "language_loss": 0.48804629, "learning_rate": 9.092633042576916e-07, "loss": 0.50971079, "num_input_tokens_seen": 124014045, "step": 5764, "time_per_iteration": 2.489847421646118 }, { "auxiliary_loss_clip": 0.01140231, "auxiliary_loss_mlp": 0.01024743, "balance_loss_clip": 1.05029774, "balance_loss_mlp": 1.01783228, "epoch": 0.6932002645343595, "flos": 29168621809920.0, "grad_norm": 1.9542100702187768, "language_loss": 0.55829382, "learning_rate": 9.086104557012446e-07, "loss": 0.57994354, "num_input_tokens_seen": 124034615, "step": 5765, "time_per_iteration": 2.602405071258545 }, { "auxiliary_loss_clip": 0.01143328, "auxiliary_loss_mlp": 0.01024036, "balance_loss_clip": 1.04605198, "balance_loss_mlp": 1.01699615, "epoch": 0.6933205074249985, "flos": 23842746483840.0, "grad_norm": 1.8121072065876174, "language_loss": 0.65813488, "learning_rate": 9.079577727077239e-07, "loss": 0.67980856, "num_input_tokens_seen": 124053445, "step": 5766, "time_per_iteration": 2.505826711654663 }, { "auxiliary_loss_clip": 0.01153524, "auxiliary_loss_mlp": 0.01028309, "balance_loss_clip": 1.04785573, "balance_loss_mlp": 1.02075088, "epoch": 0.6934407503156376, "flos": 24166696268160.0, "grad_norm": 2.044699950799127, "language_loss": 0.71895641, "learning_rate": 9.073052553761404e-07, "loss": 0.74077475, "num_input_tokens_seen": 124072810, "step": 5767, "time_per_iteration": 2.5104804039001465 }, { "auxiliary_loss_clip": 0.01113938, "auxiliary_loss_mlp": 0.01022284, "balance_loss_clip": 1.04355717, "balance_loss_mlp": 1.01460052, "epoch": 0.6935609932062767, "flos": 20631327120000.0, "grad_norm": 1.5904178488206369, "language_loss": 0.78221267, "learning_rate": 9.066529038054805e-07, "loss": 0.80357492, "num_input_tokens_seen": 124092875, "step": 5768, "time_per_iteration": 2.6197214126586914 }, { "auxiliary_loss_clip": 0.01137075, "auxiliary_loss_mlp": 0.01025585, "balance_loss_clip": 1.04582739, "balance_loss_mlp": 1.01918089, "epoch": 0.6936812360969158, "flos": 18254184019200.0, "grad_norm": 1.7028796858108963, "language_loss": 0.73843378, "learning_rate": 9.060007180947071e-07, "loss": 0.76006043, "num_input_tokens_seen": 124110930, "step": 5769, "time_per_iteration": 3.3316965103149414 }, { "auxiliary_loss_clip": 0.01106749, "auxiliary_loss_mlp": 0.01026449, "balance_loss_clip": 1.03931403, "balance_loss_mlp": 1.01872444, "epoch": 0.6938014789875548, "flos": 31317336368640.0, "grad_norm": 2.569058229895464, "language_loss": 0.73034328, "learning_rate": 9.053486983427534e-07, "loss": 0.75167525, "num_input_tokens_seen": 124132180, "step": 5770, "time_per_iteration": 2.667313575744629 }, { "auxiliary_loss_clip": 0.01144825, "auxiliary_loss_mlp": 0.01027993, "balance_loss_clip": 1.04714537, "balance_loss_mlp": 1.02103436, "epoch": 0.6939217218781939, "flos": 17528429721600.0, "grad_norm": 1.788468537288641, "language_loss": 0.7060408, "learning_rate": 9.046968446485326e-07, "loss": 0.72776896, "num_input_tokens_seen": 124150585, "step": 5771, "time_per_iteration": 2.5142362117767334 }, { "auxiliary_loss_clip": 0.01156985, "auxiliary_loss_mlp": 0.01029736, "balance_loss_clip": 1.04905438, "balance_loss_mlp": 1.02153981, "epoch": 0.6940419647688331, "flos": 18551776199040.0, "grad_norm": 2.3075660714802786, "language_loss": 0.70356917, "learning_rate": 9.040451571109295e-07, "loss": 0.72543627, "num_input_tokens_seen": 124166205, "step": 5772, "time_per_iteration": 2.4545812606811523 }, { "auxiliary_loss_clip": 0.01021994, "auxiliary_loss_mlp": 0.01001371, "balance_loss_clip": 1.00844491, "balance_loss_mlp": 1.00028038, "epoch": 0.6941622076594721, "flos": 66926286829440.0, "grad_norm": 0.8276997743815718, "language_loss": 0.60385895, "learning_rate": 9.033936358288042e-07, "loss": 0.62409258, "num_input_tokens_seen": 124219940, "step": 5773, "time_per_iteration": 3.0159482955932617 }, { "auxiliary_loss_clip": 0.01168923, "auxiliary_loss_mlp": 0.01025496, "balance_loss_clip": 1.05043828, "balance_loss_mlp": 1.01879609, "epoch": 0.6942824505501112, "flos": 26578062051840.0, "grad_norm": 1.5751878367666334, "language_loss": 0.82283241, "learning_rate": 9.027422809009937e-07, "loss": 0.84477663, "num_input_tokens_seen": 124239885, "step": 5774, "time_per_iteration": 2.5000133514404297 }, { "auxiliary_loss_clip": 0.01152694, "auxiliary_loss_mlp": 0.01030818, "balance_loss_clip": 1.04500747, "balance_loss_mlp": 1.02361774, "epoch": 0.6944026934407503, "flos": 21248308056960.0, "grad_norm": 1.561970681331322, "language_loss": 0.83204174, "learning_rate": 9.020910924263054e-07, "loss": 0.85387689, "num_input_tokens_seen": 124258410, "step": 5775, "time_per_iteration": 2.4969499111175537 }, { "auxiliary_loss_clip": 0.01020285, "auxiliary_loss_mlp": 0.0100137, "balance_loss_clip": 1.00840592, "balance_loss_mlp": 1.00023746, "epoch": 0.6945229363313894, "flos": 70677191537280.0, "grad_norm": 0.8218927528050084, "language_loss": 0.58166867, "learning_rate": 9.014400705035261e-07, "loss": 0.6018852, "num_input_tokens_seen": 124315315, "step": 5776, "time_per_iteration": 3.176245927810669 }, { "auxiliary_loss_clip": 0.0116735, "auxiliary_loss_mlp": 0.01026914, "balance_loss_clip": 1.05123854, "balance_loss_mlp": 1.01949942, "epoch": 0.6946431792220285, "flos": 18952934267520.0, "grad_norm": 1.870377877803489, "language_loss": 0.76448852, "learning_rate": 9.00789215231414e-07, "loss": 0.78643113, "num_input_tokens_seen": 124333710, "step": 5777, "time_per_iteration": 2.450627565383911 }, { "auxiliary_loss_clip": 0.01122008, "auxiliary_loss_mlp": 0.00761234, "balance_loss_clip": 1.04038525, "balance_loss_mlp": 1.000193, "epoch": 0.6947634221126676, "flos": 20338834671360.0, "grad_norm": 1.8085806150652846, "language_loss": 0.81715369, "learning_rate": 9.001385267087056e-07, "loss": 0.83598614, "num_input_tokens_seen": 124352855, "step": 5778, "time_per_iteration": 2.5647757053375244 }, { "auxiliary_loss_clip": 0.01157353, "auxiliary_loss_mlp": 0.01027376, "balance_loss_clip": 1.05036998, "balance_loss_mlp": 1.02036059, "epoch": 0.6948836650033067, "flos": 21833723917440.0, "grad_norm": 1.4730761752565158, "language_loss": 0.70301723, "learning_rate": 8.994880050341072e-07, "loss": 0.72486454, "num_input_tokens_seen": 124372960, "step": 5779, "time_per_iteration": 3.2798702716827393 }, { "auxiliary_loss_clip": 0.01132889, "auxiliary_loss_mlp": 0.01036425, "balance_loss_clip": 1.04597723, "balance_loss_mlp": 1.02902269, "epoch": 0.6950039078939457, "flos": 23657519024640.0, "grad_norm": 1.7246428544315826, "language_loss": 0.77391571, "learning_rate": 8.988376503063026e-07, "loss": 0.79560882, "num_input_tokens_seen": 124394220, "step": 5780, "time_per_iteration": 2.579139232635498 }, { "auxiliary_loss_clip": 0.0111982, "auxiliary_loss_mlp": 0.01025592, "balance_loss_clip": 1.04590178, "balance_loss_mlp": 1.01790333, "epoch": 0.6951241507845849, "flos": 21792462168960.0, "grad_norm": 2.5855876339743507, "language_loss": 0.81502253, "learning_rate": 8.981874626239521e-07, "loss": 0.83647668, "num_input_tokens_seen": 124412795, "step": 5781, "time_per_iteration": 3.368726968765259 }, { "auxiliary_loss_clip": 0.01156242, "auxiliary_loss_mlp": 0.01035422, "balance_loss_clip": 1.05069363, "balance_loss_mlp": 1.02796507, "epoch": 0.695244393675224, "flos": 14647568244480.0, "grad_norm": 1.9070884902838607, "language_loss": 0.88525248, "learning_rate": 8.975374420856872e-07, "loss": 0.9071691, "num_input_tokens_seen": 124429690, "step": 5782, "time_per_iteration": 3.2339119911193848 }, { "auxiliary_loss_clip": 0.01115648, "auxiliary_loss_mlp": 0.01024172, "balance_loss_clip": 1.04168558, "balance_loss_mlp": 1.01667356, "epoch": 0.695364636565863, "flos": 16873203778560.0, "grad_norm": 2.2694039503162413, "language_loss": 0.72529227, "learning_rate": 8.968875887901157e-07, "loss": 0.74669051, "num_input_tokens_seen": 124447070, "step": 5783, "time_per_iteration": 2.5403735637664795 }, { "auxiliary_loss_clip": 0.01139984, "auxiliary_loss_mlp": 0.0103106, "balance_loss_clip": 1.04504251, "balance_loss_mlp": 1.02350807, "epoch": 0.6954848794565022, "flos": 19354523299200.0, "grad_norm": 1.796993001160796, "language_loss": 0.62148094, "learning_rate": 8.9623790283582e-07, "loss": 0.6431914, "num_input_tokens_seen": 124464950, "step": 5784, "time_per_iteration": 2.550201177597046 }, { "auxiliary_loss_clip": 0.0112852, "auxiliary_loss_mlp": 0.01033925, "balance_loss_clip": 1.04702401, "balance_loss_mlp": 1.02675462, "epoch": 0.6956051223471412, "flos": 18990209606400.0, "grad_norm": 1.9500221153847093, "language_loss": 0.76325321, "learning_rate": 8.955883843213561e-07, "loss": 0.78487772, "num_input_tokens_seen": 124483965, "step": 5785, "time_per_iteration": 2.5740325450897217 }, { "auxiliary_loss_clip": 0.01162492, "auxiliary_loss_mlp": 0.01030506, "balance_loss_clip": 1.0501318, "balance_loss_mlp": 1.02271605, "epoch": 0.6957253652377803, "flos": 16107229226880.0, "grad_norm": 1.7977836921335948, "language_loss": 0.86733103, "learning_rate": 8.949390333452569e-07, "loss": 0.88926101, "num_input_tokens_seen": 124501910, "step": 5786, "time_per_iteration": 2.4971866607666016 }, { "auxiliary_loss_clip": 0.01167303, "auxiliary_loss_mlp": 0.0103211, "balance_loss_clip": 1.05040681, "balance_loss_mlp": 1.02503538, "epoch": 0.6958456081284194, "flos": 29388646569600.0, "grad_norm": 2.06581412377045, "language_loss": 0.67747438, "learning_rate": 8.942898500060279e-07, "loss": 0.69946849, "num_input_tokens_seen": 124521625, "step": 5787, "time_per_iteration": 2.522145986557007 }, { "auxiliary_loss_clip": 0.011167, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.04581547, "balance_loss_mlp": 1.02693212, "epoch": 0.6959658510190585, "flos": 25154850395520.0, "grad_norm": 4.081670175532175, "language_loss": 0.71256196, "learning_rate": 8.936408344021493e-07, "loss": 0.73407149, "num_input_tokens_seen": 124538540, "step": 5788, "time_per_iteration": 2.629964590072632 }, { "auxiliary_loss_clip": 0.01145917, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.0502218, "balance_loss_mlp": 1.02543736, "epoch": 0.6960860939096976, "flos": 42814388759040.0, "grad_norm": 2.254324620653938, "language_loss": 0.71200734, "learning_rate": 8.929919866320765e-07, "loss": 0.73379576, "num_input_tokens_seen": 124559355, "step": 5789, "time_per_iteration": 2.724989891052246 }, { "auxiliary_loss_clip": 0.01131454, "auxiliary_loss_mlp": 0.00761693, "balance_loss_clip": 1.04522133, "balance_loss_mlp": 1.00017595, "epoch": 0.6962063368003367, "flos": 17566566986880.0, "grad_norm": 1.872096031518265, "language_loss": 0.81174505, "learning_rate": 8.923433067942385e-07, "loss": 0.83067656, "num_input_tokens_seen": 124577920, "step": 5790, "time_per_iteration": 2.5689969062805176 }, { "auxiliary_loss_clip": 0.01131778, "auxiliary_loss_mlp": 0.01029979, "balance_loss_clip": 1.0484556, "balance_loss_mlp": 1.02249539, "epoch": 0.6963265796909758, "flos": 21251648021760.0, "grad_norm": 1.983868673448697, "language_loss": 0.6856277, "learning_rate": 8.916947949870417e-07, "loss": 0.70724523, "num_input_tokens_seen": 124597585, "step": 5791, "time_per_iteration": 2.555908441543579 }, { "auxiliary_loss_clip": 0.01048802, "auxiliary_loss_mlp": 0.01000939, "balance_loss_clip": 1.00962245, "balance_loss_mlp": 0.9998126, "epoch": 0.6964468225816148, "flos": 68828295801600.0, "grad_norm": 0.7393180438891565, "language_loss": 0.58109879, "learning_rate": 8.910464513088615e-07, "loss": 0.60159624, "num_input_tokens_seen": 124661625, "step": 5792, "time_per_iteration": 3.1899287700653076 }, { "auxiliary_loss_clip": 0.01134755, "auxiliary_loss_mlp": 0.01030898, "balance_loss_clip": 1.04666054, "balance_loss_mlp": 1.0230931, "epoch": 0.696567065472254, "flos": 18950887192320.0, "grad_norm": 1.8140604490714414, "language_loss": 0.78513747, "learning_rate": 8.903982758580542e-07, "loss": 0.80679399, "num_input_tokens_seen": 124680565, "step": 5793, "time_per_iteration": 2.524005174636841 }, { "auxiliary_loss_clip": 0.01132425, "auxiliary_loss_mlp": 0.01025292, "balance_loss_clip": 1.04396737, "balance_loss_mlp": 1.01803756, "epoch": 0.696687308362893, "flos": 22856675345280.0, "grad_norm": 1.9294533179211195, "language_loss": 0.80509031, "learning_rate": 8.897502687329457e-07, "loss": 0.82666743, "num_input_tokens_seen": 124700365, "step": 5794, "time_per_iteration": 2.539825677871704 }, { "auxiliary_loss_clip": 0.01120303, "auxiliary_loss_mlp": 0.01030551, "balance_loss_clip": 1.0431751, "balance_loss_mlp": 1.02339578, "epoch": 0.6968075512535321, "flos": 24972926987520.0, "grad_norm": 1.9036001341227715, "language_loss": 0.799164, "learning_rate": 8.891024300318382e-07, "loss": 0.82067257, "num_input_tokens_seen": 124718935, "step": 5795, "time_per_iteration": 3.347975015640259 }, { "auxiliary_loss_clip": 0.01117112, "auxiliary_loss_mlp": 0.01026731, "balance_loss_clip": 1.04245162, "balance_loss_mlp": 1.02012956, "epoch": 0.6969277941441713, "flos": 21030438113280.0, "grad_norm": 1.4160816358124628, "language_loss": 0.75874621, "learning_rate": 8.884547598530103e-07, "loss": 0.78018463, "num_input_tokens_seen": 124739505, "step": 5796, "time_per_iteration": 2.771071195602417 }, { "auxiliary_loss_clip": 0.01071724, "auxiliary_loss_mlp": 0.01026817, "balance_loss_clip": 1.03796029, "balance_loss_mlp": 1.019593, "epoch": 0.6970480370348103, "flos": 21579404647680.0, "grad_norm": 1.8229623382749671, "language_loss": 0.75096893, "learning_rate": 8.8780725829471e-07, "loss": 0.77195442, "num_input_tokens_seen": 124757410, "step": 5797, "time_per_iteration": 2.6394848823547363 }, { "auxiliary_loss_clip": 0.01167765, "auxiliary_loss_mlp": 0.01033042, "balance_loss_clip": 1.04950523, "balance_loss_mlp": 1.02552891, "epoch": 0.6971682799254494, "flos": 22419175691520.0, "grad_norm": 1.758212336818437, "language_loss": 0.78020328, "learning_rate": 8.87159925455165e-07, "loss": 0.80221128, "num_input_tokens_seen": 124777240, "step": 5798, "time_per_iteration": 2.461414337158203 }, { "auxiliary_loss_clip": 0.01121866, "auxiliary_loss_mlp": 0.01025459, "balance_loss_clip": 1.04454195, "balance_loss_mlp": 1.01845586, "epoch": 0.6972885228160886, "flos": 20005834659840.0, "grad_norm": 1.8353313739091295, "language_loss": 0.73191297, "learning_rate": 8.865127614325738e-07, "loss": 0.7533862, "num_input_tokens_seen": 124795670, "step": 5799, "time_per_iteration": 2.5503647327423096 }, { "auxiliary_loss_clip": 0.01131247, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.04298735, "balance_loss_mlp": 1.02458465, "epoch": 0.6974087657067276, "flos": 37853437656960.0, "grad_norm": 1.7197523792300766, "language_loss": 0.66769058, "learning_rate": 8.85865766325113e-07, "loss": 0.68932915, "num_input_tokens_seen": 124819600, "step": 5800, "time_per_iteration": 2.675478219985962 }, { "auxiliary_loss_clip": 0.01135178, "auxiliary_loss_mlp": 0.01028236, "balance_loss_clip": 1.04574418, "balance_loss_mlp": 1.02106285, "epoch": 0.6975290085973667, "flos": 29489267543040.0, "grad_norm": 2.5630787160092727, "language_loss": 0.71849018, "learning_rate": 8.852189402309287e-07, "loss": 0.74012429, "num_input_tokens_seen": 124838785, "step": 5801, "time_per_iteration": 2.579350709915161 }, { "auxiliary_loss_clip": 0.01154822, "auxiliary_loss_mlp": 0.01026283, "balance_loss_clip": 1.05144083, "balance_loss_mlp": 1.01944065, "epoch": 0.6976492514880057, "flos": 12895630295040.0, "grad_norm": 2.0031023016269565, "language_loss": 0.73839939, "learning_rate": 8.845722832481441e-07, "loss": 0.76021039, "num_input_tokens_seen": 124854215, "step": 5802, "time_per_iteration": 2.4543538093566895 }, { "auxiliary_loss_clip": 0.0115238, "auxiliary_loss_mlp": 0.01033203, "balance_loss_clip": 1.04875088, "balance_loss_mlp": 1.02539515, "epoch": 0.6977694943786449, "flos": 24352929308160.0, "grad_norm": 1.74198669929843, "language_loss": 0.77000058, "learning_rate": 8.83925795474858e-07, "loss": 0.79185641, "num_input_tokens_seen": 124874340, "step": 5803, "time_per_iteration": 2.5304338932037354 }, { "auxiliary_loss_clip": 0.01124046, "auxiliary_loss_mlp": 0.01032507, "balance_loss_clip": 1.04799771, "balance_loss_mlp": 1.02501428, "epoch": 0.6978897372692839, "flos": 29898470257920.0, "grad_norm": 2.1002700245293386, "language_loss": 0.58731067, "learning_rate": 8.832794770091414e-07, "loss": 0.60887623, "num_input_tokens_seen": 124895175, "step": 5804, "time_per_iteration": 2.6071298122406006 }, { "auxiliary_loss_clip": 0.01142207, "auxiliary_loss_mlp": 0.01026802, "balance_loss_clip": 1.04473281, "balance_loss_mlp": 1.0202843, "epoch": 0.698009980159923, "flos": 21761579450880.0, "grad_norm": 1.9165332500855048, "language_loss": 0.82098019, "learning_rate": 8.826333279490401e-07, "loss": 0.84267032, "num_input_tokens_seen": 124915810, "step": 5805, "time_per_iteration": 3.311262607574463 }, { "auxiliary_loss_clip": 0.01143534, "auxiliary_loss_mlp": 0.01026617, "balance_loss_clip": 1.04911828, "balance_loss_mlp": 1.01908875, "epoch": 0.6981302230505622, "flos": 19857164267520.0, "grad_norm": 1.980368275503191, "language_loss": 0.68050158, "learning_rate": 8.819873483925748e-07, "loss": 0.70220315, "num_input_tokens_seen": 124932930, "step": 5806, "time_per_iteration": 2.5157883167266846 }, { "auxiliary_loss_clip": 0.01131221, "auxiliary_loss_mlp": 0.007605, "balance_loss_clip": 1.04708123, "balance_loss_mlp": 1.00015974, "epoch": 0.6982504659412012, "flos": 22198648141440.0, "grad_norm": 1.8202533774014515, "language_loss": 0.74286604, "learning_rate": 8.81341538437739e-07, "loss": 0.76178324, "num_input_tokens_seen": 124951220, "step": 5807, "time_per_iteration": 3.402418613433838 }, { "auxiliary_loss_clip": 0.01143384, "auxiliary_loss_mlp": 0.01022692, "balance_loss_clip": 1.04508853, "balance_loss_mlp": 1.01540875, "epoch": 0.6983707088318403, "flos": 35588479708800.0, "grad_norm": 1.8554290740263364, "language_loss": 0.68085206, "learning_rate": 8.80695898182503e-07, "loss": 0.7025128, "num_input_tokens_seen": 124972200, "step": 5808, "time_per_iteration": 3.3935585021972656 }, { "auxiliary_loss_clip": 0.01040503, "auxiliary_loss_mlp": 0.01001898, "balance_loss_clip": 1.00893462, "balance_loss_mlp": 1.00084925, "epoch": 0.6984909517224794, "flos": 65440052760960.0, "grad_norm": 0.8251627906677085, "language_loss": 0.65138257, "learning_rate": 8.800504277248093e-07, "loss": 0.67180657, "num_input_tokens_seen": 125036950, "step": 5809, "time_per_iteration": 3.0882134437561035 }, { "auxiliary_loss_clip": 0.0112054, "auxiliary_loss_mlp": 0.00760453, "balance_loss_clip": 1.04870939, "balance_loss_mlp": 1.0001713, "epoch": 0.6986111946131185, "flos": 18546927863040.0, "grad_norm": 1.667063018018832, "language_loss": 0.74956727, "learning_rate": 8.794051271625753e-07, "loss": 0.76837724, "num_input_tokens_seen": 125054585, "step": 5810, "time_per_iteration": 2.525453567504883 }, { "auxiliary_loss_clip": 0.01139275, "auxiliary_loss_mlp": 0.0102751, "balance_loss_clip": 1.0454464, "balance_loss_mlp": 1.02027369, "epoch": 0.6987314375037575, "flos": 23039173370880.0, "grad_norm": 1.554309501488968, "language_loss": 0.82934266, "learning_rate": 8.787599965936925e-07, "loss": 0.8510105, "num_input_tokens_seen": 125075515, "step": 5811, "time_per_iteration": 2.540602445602417 }, { "auxiliary_loss_clip": 0.01122694, "auxiliary_loss_mlp": 0.0102432, "balance_loss_clip": 1.04706717, "balance_loss_mlp": 1.01714683, "epoch": 0.6988516803943967, "flos": 38400393029760.0, "grad_norm": 1.7781028879873573, "language_loss": 0.7207495, "learning_rate": 8.781150361160261e-07, "loss": 0.74221957, "num_input_tokens_seen": 125097425, "step": 5812, "time_per_iteration": 2.697218656539917 }, { "auxiliary_loss_clip": 0.01125493, "auxiliary_loss_mlp": 0.01030814, "balance_loss_clip": 1.0470953, "balance_loss_mlp": 1.02355731, "epoch": 0.6989719232850358, "flos": 24096993926400.0, "grad_norm": 1.6998201816230187, "language_loss": 0.73635674, "learning_rate": 8.774702458274181e-07, "loss": 0.75791985, "num_input_tokens_seen": 125117830, "step": 5813, "time_per_iteration": 2.562938928604126 }, { "auxiliary_loss_clip": 0.01148776, "auxiliary_loss_mlp": 0.01023648, "balance_loss_clip": 1.04590285, "balance_loss_mlp": 1.01650739, "epoch": 0.6990921661756748, "flos": 14866838818560.0, "grad_norm": 2.256991388508272, "language_loss": 0.70674235, "learning_rate": 8.768256258256799e-07, "loss": 0.72846657, "num_input_tokens_seen": 125134455, "step": 5814, "time_per_iteration": 2.4377193450927734 }, { "auxiliary_loss_clip": 0.01155159, "auxiliary_loss_mlp": 0.01025919, "balance_loss_clip": 1.04804659, "balance_loss_mlp": 1.01885283, "epoch": 0.699212409066314, "flos": 20193719725440.0, "grad_norm": 1.709806744181285, "language_loss": 0.73955011, "learning_rate": 8.76181176208602e-07, "loss": 0.76136094, "num_input_tokens_seen": 125152555, "step": 5815, "time_per_iteration": 2.482174873352051 }, { "auxiliary_loss_clip": 0.01097516, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.03858399, "balance_loss_mlp": 1.02115631, "epoch": 0.699332651956953, "flos": 19427888828160.0, "grad_norm": 1.723583608420724, "language_loss": 0.73329335, "learning_rate": 8.755368970739461e-07, "loss": 0.75456035, "num_input_tokens_seen": 125171915, "step": 5816, "time_per_iteration": 2.6492061614990234 }, { "auxiliary_loss_clip": 0.01128843, "auxiliary_loss_mlp": 0.01024183, "balance_loss_clip": 1.0421592, "balance_loss_mlp": 1.01643991, "epoch": 0.6994528948475921, "flos": 16143714466560.0, "grad_norm": 2.209335058612384, "language_loss": 0.61344159, "learning_rate": 8.748927885194479e-07, "loss": 0.63497186, "num_input_tokens_seen": 125190220, "step": 5817, "time_per_iteration": 2.536794424057007 }, { "auxiliary_loss_clip": 0.01022299, "auxiliary_loss_mlp": 0.01001556, "balance_loss_clip": 1.01156878, "balance_loss_mlp": 1.00033462, "epoch": 0.6995731377382313, "flos": 64952420699520.0, "grad_norm": 0.800800386397483, "language_loss": 0.57423663, "learning_rate": 8.742488506428209e-07, "loss": 0.59447515, "num_input_tokens_seen": 125249310, "step": 5818, "time_per_iteration": 3.100766658782959 }, { "auxiliary_loss_clip": 0.01140886, "auxiliary_loss_mlp": 0.00760849, "balance_loss_clip": 1.04391372, "balance_loss_mlp": 1.00013852, "epoch": 0.6996933806288703, "flos": 24900136076160.0, "grad_norm": 2.318750953148414, "language_loss": 0.7815032, "learning_rate": 8.736050835417466e-07, "loss": 0.80052054, "num_input_tokens_seen": 125269350, "step": 5819, "time_per_iteration": 2.5597891807556152 }, { "auxiliary_loss_clip": 0.01158275, "auxiliary_loss_mlp": 0.01028124, "balance_loss_clip": 1.05068779, "balance_loss_mlp": 1.02080488, "epoch": 0.6998136235195094, "flos": 20777806782720.0, "grad_norm": 7.416729308829997, "language_loss": 0.61136651, "learning_rate": 8.729614873138862e-07, "loss": 0.63323045, "num_input_tokens_seen": 125286985, "step": 5820, "time_per_iteration": 2.4939725399017334 }, { "auxiliary_loss_clip": 0.01117358, "auxiliary_loss_mlp": 0.01026722, "balance_loss_clip": 1.04673553, "balance_loss_mlp": 1.01908374, "epoch": 0.6999338664101485, "flos": 23733470332800.0, "grad_norm": 1.9889710053702145, "language_loss": 0.77558291, "learning_rate": 8.723180620568716e-07, "loss": 0.79702377, "num_input_tokens_seen": 125306240, "step": 5821, "time_per_iteration": 2.5891356468200684 }, { "auxiliary_loss_clip": 0.01140057, "auxiliary_loss_mlp": 0.01027717, "balance_loss_clip": 1.04658163, "balance_loss_mlp": 1.02085042, "epoch": 0.7000541093007876, "flos": 19864598382720.0, "grad_norm": 1.9410457932721552, "language_loss": 0.85160106, "learning_rate": 8.716748078683116e-07, "loss": 0.8732788, "num_input_tokens_seen": 125323015, "step": 5822, "time_per_iteration": 3.299949884414673 }, { "auxiliary_loss_clip": 0.01073295, "auxiliary_loss_mlp": 0.01029801, "balance_loss_clip": 1.03860855, "balance_loss_mlp": 1.0219537, "epoch": 0.7001743521914267, "flos": 29679056029440.0, "grad_norm": 2.0103081119463493, "language_loss": 0.68484998, "learning_rate": 8.710317248457855e-07, "loss": 0.70588088, "num_input_tokens_seen": 125342630, "step": 5823, "time_per_iteration": 2.7237963676452637 }, { "auxiliary_loss_clip": 0.0113444, "auxiliary_loss_mlp": 0.01028602, "balance_loss_clip": 1.0484606, "balance_loss_mlp": 1.02102923, "epoch": 0.7002945950820658, "flos": 27489762080640.0, "grad_norm": 1.7158994952690103, "language_loss": 0.72314167, "learning_rate": 8.703888130868482e-07, "loss": 0.74477214, "num_input_tokens_seen": 125364480, "step": 5824, "time_per_iteration": 2.824151039123535 }, { "auxiliary_loss_clip": 0.0112507, "auxiliary_loss_mlp": 0.01030219, "balance_loss_clip": 1.04604101, "balance_loss_mlp": 1.02290559, "epoch": 0.7004148379727049, "flos": 22158463800960.0, "grad_norm": 10.389204859002279, "language_loss": 0.82144964, "learning_rate": 8.697460726890307e-07, "loss": 0.84300256, "num_input_tokens_seen": 125381625, "step": 5825, "time_per_iteration": 2.5360069274902344 }, { "auxiliary_loss_clip": 0.01121246, "auxiliary_loss_mlp": 0.00760797, "balance_loss_clip": 1.04128885, "balance_loss_mlp": 1.00016761, "epoch": 0.7005350808633439, "flos": 19423758764160.0, "grad_norm": 2.015459502124115, "language_loss": 0.90569198, "learning_rate": 8.691035037498354e-07, "loss": 0.92451239, "num_input_tokens_seen": 125397615, "step": 5826, "time_per_iteration": 2.5637917518615723 }, { "auxiliary_loss_clip": 0.01135136, "auxiliary_loss_mlp": 0.01027709, "balance_loss_clip": 1.04379833, "balance_loss_mlp": 1.02063417, "epoch": 0.7006553237539831, "flos": 23476708938240.0, "grad_norm": 1.6077474272061392, "language_loss": 0.72206962, "learning_rate": 8.684611063667391e-07, "loss": 0.74369806, "num_input_tokens_seen": 125418080, "step": 5827, "time_per_iteration": 2.5230302810668945 }, { "auxiliary_loss_clip": 0.01150448, "auxiliary_loss_mlp": 0.01025443, "balance_loss_clip": 1.04536366, "balance_loss_mlp": 1.01829934, "epoch": 0.7007755666446221, "flos": 31212872640000.0, "grad_norm": 4.347869231692518, "language_loss": 0.76887155, "learning_rate": 8.678188806371935e-07, "loss": 0.7906304, "num_input_tokens_seen": 125440115, "step": 5828, "time_per_iteration": 2.5747458934783936 }, { "auxiliary_loss_clip": 0.0115092, "auxiliary_loss_mlp": 0.01030015, "balance_loss_clip": 1.04647946, "balance_loss_mlp": 1.02363169, "epoch": 0.7008958095352612, "flos": 18149899858560.0, "grad_norm": 1.7303015009828693, "language_loss": 0.85509479, "learning_rate": 8.671768266586228e-07, "loss": 0.87690413, "num_input_tokens_seen": 125458240, "step": 5829, "time_per_iteration": 2.4542412757873535 }, { "auxiliary_loss_clip": 0.01122875, "auxiliary_loss_mlp": 0.01027682, "balance_loss_clip": 1.04507101, "balance_loss_mlp": 1.02032995, "epoch": 0.7010160524259004, "flos": 27452307173760.0, "grad_norm": 2.1809614796688837, "language_loss": 0.77901095, "learning_rate": 8.665349445284275e-07, "loss": 0.80051649, "num_input_tokens_seen": 125477980, "step": 5830, "time_per_iteration": 2.6131842136383057 }, { "auxiliary_loss_clip": 0.01124221, "auxiliary_loss_mlp": 0.0102378, "balance_loss_clip": 1.0454011, "balance_loss_mlp": 1.01685703, "epoch": 0.7011362953165394, "flos": 23842064125440.0, "grad_norm": 1.5914114058583368, "language_loss": 0.80896455, "learning_rate": 8.658932343439799e-07, "loss": 0.83044457, "num_input_tokens_seen": 125497765, "step": 5831, "time_per_iteration": 3.3624064922332764 }, { "auxiliary_loss_clip": 0.01168234, "auxiliary_loss_mlp": 0.01025367, "balance_loss_clip": 1.05207503, "balance_loss_mlp": 1.01828051, "epoch": 0.7012565382071785, "flos": 24823430582400.0, "grad_norm": 2.7038116477894283, "language_loss": 0.77791321, "learning_rate": 8.65251696202627e-07, "loss": 0.79984921, "num_input_tokens_seen": 125514145, "step": 5832, "time_per_iteration": 2.467630624771118 }, { "auxiliary_loss_clip": 0.01123632, "auxiliary_loss_mlp": 0.01029424, "balance_loss_clip": 1.04619431, "balance_loss_mlp": 1.02175915, "epoch": 0.7013767810978175, "flos": 21397445326080.0, "grad_norm": 2.05243756724098, "language_loss": 0.87693346, "learning_rate": 8.646103302016896e-07, "loss": 0.89846408, "num_input_tokens_seen": 125533115, "step": 5833, "time_per_iteration": 4.374024152755737 }, { "auxiliary_loss_clip": 0.01120336, "auxiliary_loss_mlp": 0.0102603, "balance_loss_clip": 1.04256976, "balance_loss_mlp": 1.01791215, "epoch": 0.7014970239884567, "flos": 16687150306560.0, "grad_norm": 1.6737092885690472, "language_loss": 0.88233256, "learning_rate": 8.639691364384614e-07, "loss": 0.9037962, "num_input_tokens_seen": 125550740, "step": 5834, "time_per_iteration": 2.577583074569702 }, { "auxiliary_loss_clip": 0.01144367, "auxiliary_loss_mlp": 0.01024903, "balance_loss_clip": 1.04934692, "balance_loss_mlp": 1.01707411, "epoch": 0.7016172668790958, "flos": 12568268718720.0, "grad_norm": 1.7329353642345469, "language_loss": 0.72947764, "learning_rate": 8.633281150102136e-07, "loss": 0.75117028, "num_input_tokens_seen": 125567590, "step": 5835, "time_per_iteration": 2.4972727298736572 }, { "auxiliary_loss_clip": 0.01139762, "auxiliary_loss_mlp": 0.01029328, "balance_loss_clip": 1.04738498, "balance_loss_mlp": 1.02206516, "epoch": 0.7017375097697348, "flos": 17452729808640.0, "grad_norm": 2.2297452069803056, "language_loss": 0.67720175, "learning_rate": 8.626872660141855e-07, "loss": 0.69889259, "num_input_tokens_seen": 125585500, "step": 5836, "time_per_iteration": 2.480466842651367 }, { "auxiliary_loss_clip": 0.01114482, "auxiliary_loss_mlp": 0.01030514, "balance_loss_clip": 1.04612553, "balance_loss_mlp": 1.02345085, "epoch": 0.701857752660374, "flos": 18513028402560.0, "grad_norm": 1.805058169306628, "language_loss": 0.74666572, "learning_rate": 8.620465895475957e-07, "loss": 0.76811576, "num_input_tokens_seen": 125603720, "step": 5837, "time_per_iteration": 2.5641074180603027 }, { "auxiliary_loss_clip": 0.01106556, "auxiliary_loss_mlp": 0.01026182, "balance_loss_clip": 1.04415405, "balance_loss_mlp": 1.0195179, "epoch": 0.701977995551013, "flos": 24425971614720.0, "grad_norm": 1.4771786835200609, "language_loss": 0.74928373, "learning_rate": 8.614060857076333e-07, "loss": 0.77061111, "num_input_tokens_seen": 125624390, "step": 5838, "time_per_iteration": 2.6109323501586914 }, { "auxiliary_loss_clip": 0.01136844, "auxiliary_loss_mlp": 0.01035368, "balance_loss_clip": 1.04506087, "balance_loss_mlp": 1.02828693, "epoch": 0.7020982384416521, "flos": 23002759958400.0, "grad_norm": 1.782012910396407, "language_loss": 0.74713439, "learning_rate": 8.60765754591462e-07, "loss": 0.76885653, "num_input_tokens_seen": 125644085, "step": 5839, "time_per_iteration": 2.534482479095459 }, { "auxiliary_loss_clip": 0.01164315, "auxiliary_loss_mlp": 0.0102657, "balance_loss_clip": 1.04736912, "balance_loss_mlp": 1.01944113, "epoch": 0.7022184813322913, "flos": 20449080489600.0, "grad_norm": 1.7553078608548847, "language_loss": 0.72872043, "learning_rate": 8.601255962962211e-07, "loss": 0.75062931, "num_input_tokens_seen": 125663095, "step": 5840, "time_per_iteration": 2.4771130084991455 }, { "auxiliary_loss_clip": 0.01161641, "auxiliary_loss_mlp": 0.01032266, "balance_loss_clip": 1.05065227, "balance_loss_mlp": 1.02395749, "epoch": 0.7023387242229303, "flos": 19790514581760.0, "grad_norm": 2.361359719373076, "language_loss": 0.71627361, "learning_rate": 8.594856109190194e-07, "loss": 0.7382127, "num_input_tokens_seen": 125680125, "step": 5841, "time_per_iteration": 2.4574127197265625 }, { "auxiliary_loss_clip": 0.01166325, "auxiliary_loss_mlp": 0.01022315, "balance_loss_clip": 1.04860735, "balance_loss_mlp": 1.01502562, "epoch": 0.7024589671135694, "flos": 33259278286080.0, "grad_norm": 1.5633934692384597, "language_loss": 0.69064808, "learning_rate": 8.588457985569446e-07, "loss": 0.71253455, "num_input_tokens_seen": 125703035, "step": 5842, "time_per_iteration": 2.599175214767456 }, { "auxiliary_loss_clip": 0.01168815, "auxiliary_loss_mlp": 0.01028384, "balance_loss_clip": 1.05070829, "balance_loss_mlp": 1.02090371, "epoch": 0.7025792100042085, "flos": 19098982967040.0, "grad_norm": 6.7565518560093745, "language_loss": 0.71670163, "learning_rate": 8.582061593070542e-07, "loss": 0.73867363, "num_input_tokens_seen": 125723765, "step": 5843, "time_per_iteration": 2.4649415016174316 }, { "auxiliary_loss_clip": 0.01168946, "auxiliary_loss_mlp": 0.00761028, "balance_loss_clip": 1.0499804, "balance_loss_mlp": 1.00020158, "epoch": 0.7026994528948476, "flos": 18952611045120.0, "grad_norm": 2.069856854353654, "language_loss": 0.76855522, "learning_rate": 8.57566693266383e-07, "loss": 0.78785491, "num_input_tokens_seen": 125741455, "step": 5844, "time_per_iteration": 2.464463472366333 }, { "auxiliary_loss_clip": 0.01142302, "auxiliary_loss_mlp": 0.0076084, "balance_loss_clip": 1.04408979, "balance_loss_mlp": 1.00016999, "epoch": 0.7028196957854866, "flos": 19536662188800.0, "grad_norm": 2.2888845349366074, "language_loss": 0.6908474, "learning_rate": 8.569274005319354e-07, "loss": 0.7098788, "num_input_tokens_seen": 125759855, "step": 5845, "time_per_iteration": 2.528874635696411 }, { "auxiliary_loss_clip": 0.0114973, "auxiliary_loss_mlp": 0.01028907, "balance_loss_clip": 1.04759622, "balance_loss_mlp": 1.02206099, "epoch": 0.7029399386761258, "flos": 20845318394880.0, "grad_norm": 1.6928417620050253, "language_loss": 0.79827845, "learning_rate": 8.562882812006913e-07, "loss": 0.82006478, "num_input_tokens_seen": 125777345, "step": 5846, "time_per_iteration": 2.4733903408050537 }, { "auxiliary_loss_clip": 0.01165792, "auxiliary_loss_mlp": 0.01033097, "balance_loss_clip": 1.04988468, "balance_loss_mlp": 1.02576888, "epoch": 0.7030601815667649, "flos": 22055005653120.0, "grad_norm": 2.008758616515949, "language_loss": 0.77360368, "learning_rate": 8.556493353696066e-07, "loss": 0.79559261, "num_input_tokens_seen": 125796345, "step": 5847, "time_per_iteration": 3.288814067840576 }, { "auxiliary_loss_clip": 0.01158155, "auxiliary_loss_mlp": 0.00761201, "balance_loss_clip": 1.04999495, "balance_loss_mlp": 1.00017869, "epoch": 0.7031804244574039, "flos": 27198742089600.0, "grad_norm": 2.3253242169129553, "language_loss": 0.67533851, "learning_rate": 8.550105631356077e-07, "loss": 0.69453204, "num_input_tokens_seen": 125816070, "step": 5848, "time_per_iteration": 2.54215669631958 }, { "auxiliary_loss_clip": 0.01119916, "auxiliary_loss_mlp": 0.01023138, "balance_loss_clip": 1.04292631, "balance_loss_mlp": 1.01610148, "epoch": 0.7033006673480431, "flos": 22379853277440.0, "grad_norm": 2.1438639449993637, "language_loss": 0.77168387, "learning_rate": 8.543719645955961e-07, "loss": 0.79311436, "num_input_tokens_seen": 125834400, "step": 5849, "time_per_iteration": 2.5561394691467285 }, { "auxiliary_loss_clip": 0.01139824, "auxiliary_loss_mlp": 0.01026472, "balance_loss_clip": 1.0449003, "balance_loss_mlp": 1.01942372, "epoch": 0.7034209102386821, "flos": 24715986024960.0, "grad_norm": 1.5726345943522109, "language_loss": 0.74317092, "learning_rate": 8.537335398464467e-07, "loss": 0.76483387, "num_input_tokens_seen": 125854720, "step": 5850, "time_per_iteration": 2.551536798477173 }, { "auxiliary_loss_clip": 0.01136793, "auxiliary_loss_mlp": 0.01028489, "balance_loss_clip": 1.04284787, "balance_loss_mlp": 1.02099681, "epoch": 0.7035411531293212, "flos": 22556174163840.0, "grad_norm": 2.5988442581358258, "language_loss": 0.85414803, "learning_rate": 8.53095288985007e-07, "loss": 0.87580085, "num_input_tokens_seen": 125868455, "step": 5851, "time_per_iteration": 2.4996259212493896 }, { "auxiliary_loss_clip": 0.01166573, "auxiliary_loss_mlp": 0.01023143, "balance_loss_clip": 1.05114532, "balance_loss_mlp": 1.01661921, "epoch": 0.7036613960199604, "flos": 22674967418880.0, "grad_norm": 1.6379293516023261, "language_loss": 0.82240051, "learning_rate": 8.524572121081009e-07, "loss": 0.84429771, "num_input_tokens_seen": 125888555, "step": 5852, "time_per_iteration": 2.465022087097168 }, { "auxiliary_loss_clip": 0.01155864, "auxiliary_loss_mlp": 0.01035076, "balance_loss_clip": 1.04683709, "balance_loss_mlp": 1.02757215, "epoch": 0.7037816389105994, "flos": 22492146170880.0, "grad_norm": 2.3772022839489404, "language_loss": 0.62222338, "learning_rate": 8.518193093125232e-07, "loss": 0.64413273, "num_input_tokens_seen": 125907610, "step": 5853, "time_per_iteration": 2.501202344894409 }, { "auxiliary_loss_clip": 0.01144477, "auxiliary_loss_mlp": 0.01027456, "balance_loss_clip": 1.04724956, "balance_loss_mlp": 1.02070284, "epoch": 0.7039018818012385, "flos": 27087490690560.0, "grad_norm": 1.643712436434686, "language_loss": 0.80690074, "learning_rate": 8.511815806950436e-07, "loss": 0.8286202, "num_input_tokens_seen": 125928640, "step": 5854, "time_per_iteration": 2.5687015056610107 }, { "auxiliary_loss_clip": 0.01148179, "auxiliary_loss_mlp": 0.01026715, "balance_loss_clip": 1.0444175, "balance_loss_mlp": 1.01989675, "epoch": 0.7040221246918776, "flos": 17749819198080.0, "grad_norm": 1.8883968287257795, "language_loss": 0.77893436, "learning_rate": 8.505440263524044e-07, "loss": 0.80068332, "num_input_tokens_seen": 125947485, "step": 5855, "time_per_iteration": 2.4825944900512695 }, { "auxiliary_loss_clip": 0.01154425, "auxiliary_loss_mlp": 0.01031728, "balance_loss_clip": 1.04669535, "balance_loss_mlp": 1.02342534, "epoch": 0.7041423675825167, "flos": 16279851012480.0, "grad_norm": 2.8303404423686622, "language_loss": 0.87788528, "learning_rate": 8.49906646381322e-07, "loss": 0.89974678, "num_input_tokens_seen": 125960320, "step": 5856, "time_per_iteration": 3.223134756088257 }, { "auxiliary_loss_clip": 0.01125785, "auxiliary_loss_mlp": 0.01027957, "balance_loss_clip": 1.04697669, "balance_loss_mlp": 1.02110219, "epoch": 0.7042626104731557, "flos": 25483181639040.0, "grad_norm": 1.7500516844237126, "language_loss": 0.71863431, "learning_rate": 8.492694408784884e-07, "loss": 0.74017167, "num_input_tokens_seen": 125980575, "step": 5857, "time_per_iteration": 2.5636167526245117 }, { "auxiliary_loss_clip": 0.01158313, "auxiliary_loss_mlp": 0.01023894, "balance_loss_clip": 1.04999137, "balance_loss_mlp": 1.01675928, "epoch": 0.7043828533637949, "flos": 17857622891520.0, "grad_norm": 2.643907696542298, "language_loss": 0.62652957, "learning_rate": 8.486324099405642e-07, "loss": 0.64835167, "num_input_tokens_seen": 125997420, "step": 5858, "time_per_iteration": 2.4577834606170654 }, { "auxiliary_loss_clip": 0.01152786, "auxiliary_loss_mlp": 0.01029047, "balance_loss_clip": 1.04696536, "balance_loss_mlp": 1.02206147, "epoch": 0.704503096254434, "flos": 29494259533440.0, "grad_norm": 1.6086773927101337, "language_loss": 0.74761856, "learning_rate": 8.479955536641887e-07, "loss": 0.76943684, "num_input_tokens_seen": 126018915, "step": 5859, "time_per_iteration": 4.106623888015747 }, { "auxiliary_loss_clip": 0.01130372, "auxiliary_loss_mlp": 0.01029775, "balance_loss_clip": 1.04088163, "balance_loss_mlp": 1.02271461, "epoch": 0.704623339145073, "flos": 30920739327360.0, "grad_norm": 3.3525152385614043, "language_loss": 0.66147757, "learning_rate": 8.473588721459716e-07, "loss": 0.683079, "num_input_tokens_seen": 126038825, "step": 5860, "time_per_iteration": 2.5856990814208984 }, { "auxiliary_loss_clip": 0.01157522, "auxiliary_loss_mlp": 0.01031802, "balance_loss_clip": 1.05072761, "balance_loss_mlp": 1.02351677, "epoch": 0.7047435820357122, "flos": 23914747296000.0, "grad_norm": 3.0905451455387887, "language_loss": 0.70779622, "learning_rate": 8.467223654824967e-07, "loss": 0.72968948, "num_input_tokens_seen": 126058280, "step": 5861, "time_per_iteration": 2.5104033946990967 }, { "auxiliary_loss_clip": 0.0114849, "auxiliary_loss_mlp": 0.01029175, "balance_loss_clip": 1.04786158, "balance_loss_mlp": 1.02195072, "epoch": 0.7048638249263512, "flos": 46494010926720.0, "grad_norm": 1.815347003049604, "language_loss": 0.62156832, "learning_rate": 8.460860337703233e-07, "loss": 0.64334488, "num_input_tokens_seen": 126078885, "step": 5862, "time_per_iteration": 2.75278639793396 }, { "auxiliary_loss_clip": 0.01112428, "auxiliary_loss_mlp": 0.01027618, "balance_loss_clip": 1.04225755, "balance_loss_mlp": 1.01964319, "epoch": 0.7049840678169903, "flos": 21689219502720.0, "grad_norm": 2.1291851969369704, "language_loss": 0.70745528, "learning_rate": 8.454498771059797e-07, "loss": 0.72885573, "num_input_tokens_seen": 126098260, "step": 5863, "time_per_iteration": 2.6473634243011475 }, { "auxiliary_loss_clip": 0.01104552, "auxiliary_loss_mlp": 0.01032503, "balance_loss_clip": 1.04328847, "balance_loss_mlp": 1.0252161, "epoch": 0.7051043107076294, "flos": 18405081054720.0, "grad_norm": 2.2936549333823515, "language_loss": 0.83522642, "learning_rate": 8.448138955859725e-07, "loss": 0.85659707, "num_input_tokens_seen": 126114845, "step": 5864, "time_per_iteration": 2.664006471633911 }, { "auxiliary_loss_clip": 0.01139335, "auxiliary_loss_mlp": 0.01027594, "balance_loss_clip": 1.04485941, "balance_loss_mlp": 1.01991653, "epoch": 0.7052245535982685, "flos": 19319043640320.0, "grad_norm": 1.8896071052256271, "language_loss": 0.89798778, "learning_rate": 8.44178089306778e-07, "loss": 0.91965705, "num_input_tokens_seen": 126132780, "step": 5865, "time_per_iteration": 2.59967041015625 }, { "auxiliary_loss_clip": 0.01166187, "auxiliary_loss_mlp": 0.01026144, "balance_loss_clip": 1.04959488, "balance_loss_mlp": 1.0196712, "epoch": 0.7053447964889076, "flos": 19062138591360.0, "grad_norm": 1.803492143934694, "language_loss": 0.77018666, "learning_rate": 8.4354245836485e-07, "loss": 0.79210991, "num_input_tokens_seen": 126151225, "step": 5866, "time_per_iteration": 2.5396835803985596 }, { "auxiliary_loss_clip": 0.01127413, "auxiliary_loss_mlp": 0.0102902, "balance_loss_clip": 1.04627538, "balance_loss_mlp": 1.02160549, "epoch": 0.7054650393795466, "flos": 27379228953600.0, "grad_norm": 1.564754910459291, "language_loss": 0.72827452, "learning_rate": 8.429070028566108e-07, "loss": 0.74983883, "num_input_tokens_seen": 126172535, "step": 5867, "time_per_iteration": 2.6511738300323486 }, { "auxiliary_loss_clip": 0.01150754, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.04626513, "balance_loss_mlp": 1.0210973, "epoch": 0.7055852822701858, "flos": 16102201322880.0, "grad_norm": 1.9212349373759396, "language_loss": 0.74893123, "learning_rate": 8.422717228784586e-07, "loss": 0.77072191, "num_input_tokens_seen": 126189410, "step": 5868, "time_per_iteration": 2.5666069984436035 }, { "auxiliary_loss_clip": 0.01110105, "auxiliary_loss_mlp": 0.0102984, "balance_loss_clip": 1.04621458, "balance_loss_mlp": 1.02295864, "epoch": 0.7057055251608249, "flos": 11692299744000.0, "grad_norm": 1.9270579328052857, "language_loss": 0.69024694, "learning_rate": 8.416366185267663e-07, "loss": 0.71164644, "num_input_tokens_seen": 126206910, "step": 5869, "time_per_iteration": 2.6389591693878174 }, { "auxiliary_loss_clip": 0.01150769, "auxiliary_loss_mlp": 0.010239, "balance_loss_clip": 1.04458475, "balance_loss_mlp": 1.01701581, "epoch": 0.7058257680514639, "flos": 22711560399360.0, "grad_norm": 1.6130552783751424, "language_loss": 0.77723861, "learning_rate": 8.410016898978778e-07, "loss": 0.79898524, "num_input_tokens_seen": 126224385, "step": 5870, "time_per_iteration": 2.477135181427002 }, { "auxiliary_loss_clip": 0.01105404, "auxiliary_loss_mlp": 0.01025944, "balance_loss_clip": 1.0442102, "balance_loss_mlp": 1.01938462, "epoch": 0.7059460109421031, "flos": 17529543043200.0, "grad_norm": 2.0077918683130234, "language_loss": 0.78756577, "learning_rate": 8.403669370881115e-07, "loss": 0.80887926, "num_input_tokens_seen": 126243120, "step": 5871, "time_per_iteration": 2.633226156234741 }, { "auxiliary_loss_clip": 0.01167069, "auxiliary_loss_mlp": 0.01027259, "balance_loss_clip": 1.05010128, "balance_loss_mlp": 1.02012467, "epoch": 0.7060662538327421, "flos": 23544687427200.0, "grad_norm": 3.6625208808778695, "language_loss": 0.7836284, "learning_rate": 8.397323601937587e-07, "loss": 0.80557168, "num_input_tokens_seen": 126263020, "step": 5872, "time_per_iteration": 2.4632811546325684 }, { "auxiliary_loss_clip": 0.01122548, "auxiliary_loss_mlp": 0.01031255, "balance_loss_clip": 1.04441595, "balance_loss_mlp": 1.02435589, "epoch": 0.7061864967233812, "flos": 30260736875520.0, "grad_norm": 1.825884418619177, "language_loss": 0.77134728, "learning_rate": 8.390979593110838e-07, "loss": 0.7928853, "num_input_tokens_seen": 126285150, "step": 5873, "time_per_iteration": 2.6225132942199707 }, { "auxiliary_loss_clip": 0.01142449, "auxiliary_loss_mlp": 0.0102426, "balance_loss_clip": 1.04782617, "balance_loss_mlp": 1.01674366, "epoch": 0.7063067396140204, "flos": 20701460424960.0, "grad_norm": 1.586588707491305, "language_loss": 0.81548047, "learning_rate": 8.384637345363262e-07, "loss": 0.83714747, "num_input_tokens_seen": 126304340, "step": 5874, "time_per_iteration": 3.2509214878082275 }, { "auxiliary_loss_clip": 0.01130059, "auxiliary_loss_mlp": 0.01029183, "balance_loss_clip": 1.04152393, "balance_loss_mlp": 1.02175045, "epoch": 0.7064269825046594, "flos": 32266168081920.0, "grad_norm": 1.6568057335174635, "language_loss": 0.76717126, "learning_rate": 8.378296859656964e-07, "loss": 0.78876364, "num_input_tokens_seen": 126325495, "step": 5875, "time_per_iteration": 2.597907543182373 }, { "auxiliary_loss_clip": 0.01140407, "auxiliary_loss_mlp": 0.01024456, "balance_loss_clip": 1.04671383, "balance_loss_mlp": 1.01776838, "epoch": 0.7065472253952985, "flos": 30227124723840.0, "grad_norm": 2.224292461697898, "language_loss": 0.68294233, "learning_rate": 8.371958136953792e-07, "loss": 0.70459104, "num_input_tokens_seen": 126345525, "step": 5876, "time_per_iteration": 2.608757495880127 }, { "auxiliary_loss_clip": 0.0113325, "auxiliary_loss_mlp": 0.01028676, "balance_loss_clip": 1.04679561, "balance_loss_mlp": 1.02060556, "epoch": 0.7066674682859376, "flos": 16216720859520.0, "grad_norm": 2.6403830089911264, "language_loss": 0.66480088, "learning_rate": 8.365621178215326e-07, "loss": 0.6864202, "num_input_tokens_seen": 126361995, "step": 5877, "time_per_iteration": 2.525420904159546 }, { "auxiliary_loss_clip": 0.01145499, "auxiliary_loss_mlp": 0.01027356, "balance_loss_clip": 1.04472184, "balance_loss_mlp": 1.02000713, "epoch": 0.7067877111765767, "flos": 14830461319680.0, "grad_norm": 2.1651071930942702, "language_loss": 0.75574213, "learning_rate": 8.359285984402871e-07, "loss": 0.77747065, "num_input_tokens_seen": 126379260, "step": 5878, "time_per_iteration": 2.4760303497314453 }, { "auxiliary_loss_clip": 0.01134715, "auxiliary_loss_mlp": 0.01024874, "balance_loss_clip": 1.04662478, "balance_loss_mlp": 1.01799273, "epoch": 0.7069079540672157, "flos": 25440196037760.0, "grad_norm": 2.007078143413203, "language_loss": 0.74045736, "learning_rate": 8.352952556477489e-07, "loss": 0.76205325, "num_input_tokens_seen": 126397170, "step": 5879, "time_per_iteration": 2.5716190338134766 }, { "auxiliary_loss_clip": 0.01152091, "auxiliary_loss_mlp": 0.01027904, "balance_loss_clip": 1.04863739, "balance_loss_mlp": 1.02076387, "epoch": 0.7070281969578549, "flos": 24607751368320.0, "grad_norm": 1.6882053455512651, "language_loss": 0.76613593, "learning_rate": 8.34662089539993e-07, "loss": 0.78793585, "num_input_tokens_seen": 126416680, "step": 5880, "time_per_iteration": 2.523428201675415 }, { "auxiliary_loss_clip": 0.01165288, "auxiliary_loss_mlp": 0.01026084, "balance_loss_clip": 1.04916835, "balance_loss_mlp": 1.01946783, "epoch": 0.707148439848494, "flos": 26724469887360.0, "grad_norm": 1.8739469475091177, "language_loss": 0.79157043, "learning_rate": 8.340291002130722e-07, "loss": 0.81348419, "num_input_tokens_seen": 126435870, "step": 5881, "time_per_iteration": 2.5144615173339844 }, { "auxiliary_loss_clip": 0.01169134, "auxiliary_loss_mlp": 0.0102818, "balance_loss_clip": 1.05018497, "balance_loss_mlp": 1.02085495, "epoch": 0.707268682739133, "flos": 15085750256640.0, "grad_norm": 2.5417359573233664, "language_loss": 0.79184103, "learning_rate": 8.3339628776301e-07, "loss": 0.81381416, "num_input_tokens_seen": 126454010, "step": 5882, "time_per_iteration": 3.236354351043701 }, { "auxiliary_loss_clip": 0.01165931, "auxiliary_loss_mlp": 0.01023512, "balance_loss_clip": 1.05010927, "balance_loss_mlp": 1.01686597, "epoch": 0.7073889256297722, "flos": 34313148345600.0, "grad_norm": 1.7687816121496613, "language_loss": 0.57387298, "learning_rate": 8.327636522858033e-07, "loss": 0.59576744, "num_input_tokens_seen": 126473615, "step": 5883, "time_per_iteration": 2.5978281497955322 }, { "auxiliary_loss_clip": 0.01111164, "auxiliary_loss_mlp": 0.01027926, "balance_loss_clip": 1.04627419, "balance_loss_mlp": 1.02102673, "epoch": 0.7075091685204112, "flos": 20083940784000.0, "grad_norm": 1.9478741770793402, "language_loss": 0.76864421, "learning_rate": 8.321311938774225e-07, "loss": 0.79003513, "num_input_tokens_seen": 126492705, "step": 5884, "time_per_iteration": 3.3583285808563232 }, { "auxiliary_loss_clip": 0.01168751, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.04873252, "balance_loss_mlp": 1.02176762, "epoch": 0.7076294114110503, "flos": 20777124424320.0, "grad_norm": 1.9705229719053095, "language_loss": 0.79183394, "learning_rate": 8.314989126338104e-07, "loss": 0.81381726, "num_input_tokens_seen": 126512715, "step": 5885, "time_per_iteration": 3.249850273132324 }, { "auxiliary_loss_clip": 0.01155191, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.04792666, "balance_loss_mlp": 1.01994956, "epoch": 0.7077496543016895, "flos": 17967689141760.0, "grad_norm": 1.7257720075279035, "language_loss": 0.84774017, "learning_rate": 8.308668086508847e-07, "loss": 0.86956489, "num_input_tokens_seen": 126530795, "step": 5886, "time_per_iteration": 2.473177671432495 }, { "auxiliary_loss_clip": 0.01126009, "auxiliary_loss_mlp": 0.01030258, "balance_loss_clip": 1.04122162, "balance_loss_mlp": 1.02250338, "epoch": 0.7078698971923285, "flos": 45478098564480.0, "grad_norm": 1.8752824600779856, "language_loss": 0.73798913, "learning_rate": 8.302348820245342e-07, "loss": 0.75955176, "num_input_tokens_seen": 126553360, "step": 5887, "time_per_iteration": 2.7934060096740723 }, { "auxiliary_loss_clip": 0.01120533, "auxiliary_loss_mlp": 0.01035, "balance_loss_clip": 1.04377353, "balance_loss_mlp": 1.02703106, "epoch": 0.7079901400829676, "flos": 26943704547840.0, "grad_norm": 4.232421285298609, "language_loss": 0.69523096, "learning_rate": 8.296031328506232e-07, "loss": 0.71678632, "num_input_tokens_seen": 126573110, "step": 5888, "time_per_iteration": 2.593430280685425 }, { "auxiliary_loss_clip": 0.01140416, "auxiliary_loss_mlp": 0.01030484, "balance_loss_clip": 1.04712737, "balance_loss_mlp": 1.02300096, "epoch": 0.7081103829736067, "flos": 24423206267520.0, "grad_norm": 1.7566459584076268, "language_loss": 0.75501245, "learning_rate": 8.289715612249857e-07, "loss": 0.77672148, "num_input_tokens_seen": 126593725, "step": 5889, "time_per_iteration": 2.573807954788208 }, { "auxiliary_loss_clip": 0.0113584, "auxiliary_loss_mlp": 0.01026536, "balance_loss_clip": 1.04551697, "balance_loss_mlp": 1.01886451, "epoch": 0.7082306258642458, "flos": 18543300589440.0, "grad_norm": 2.5673152059384448, "language_loss": 0.77500725, "learning_rate": 8.283401672434305e-07, "loss": 0.79663104, "num_input_tokens_seen": 126608950, "step": 5890, "time_per_iteration": 2.4797518253326416 }, { "auxiliary_loss_clip": 0.01135789, "auxiliary_loss_mlp": 0.01027615, "balance_loss_clip": 1.04676378, "balance_loss_mlp": 1.02087116, "epoch": 0.7083508687548848, "flos": 23477534951040.0, "grad_norm": 4.034952064423403, "language_loss": 0.70197719, "learning_rate": 8.277089510017412e-07, "loss": 0.72361124, "num_input_tokens_seen": 126629755, "step": 5891, "time_per_iteration": 2.557799816131592 }, { "auxiliary_loss_clip": 0.0113713, "auxiliary_loss_mlp": 0.0102521, "balance_loss_clip": 1.04795945, "balance_loss_mlp": 1.01832604, "epoch": 0.708471111645524, "flos": 22419463000320.0, "grad_norm": 1.7810892218867753, "language_loss": 0.81972742, "learning_rate": 8.270779125956719e-07, "loss": 0.84135079, "num_input_tokens_seen": 126650135, "step": 5892, "time_per_iteration": 2.522712469100952 }, { "auxiliary_loss_clip": 0.01103442, "auxiliary_loss_mlp": 0.01022167, "balance_loss_clip": 1.0419507, "balance_loss_mlp": 1.01548517, "epoch": 0.7085913545361631, "flos": 20922885815040.0, "grad_norm": 1.9280816115009511, "language_loss": 0.80005538, "learning_rate": 8.264470521209505e-07, "loss": 0.82131147, "num_input_tokens_seen": 126668500, "step": 5893, "time_per_iteration": 2.6111981868743896 }, { "auxiliary_loss_clip": 0.01143725, "auxiliary_loss_mlp": 0.0103226, "balance_loss_clip": 1.04499078, "balance_loss_mlp": 1.02443743, "epoch": 0.7087115974268021, "flos": 15012384727680.0, "grad_norm": 2.279620653419183, "language_loss": 0.76413298, "learning_rate": 8.258163696732785e-07, "loss": 0.78589284, "num_input_tokens_seen": 126686090, "step": 5894, "time_per_iteration": 2.471235990524292 }, { "auxiliary_loss_clip": 0.01149051, "auxiliary_loss_mlp": 0.0102864, "balance_loss_clip": 1.04768801, "balance_loss_mlp": 1.02136815, "epoch": 0.7088318403174413, "flos": 21539040739200.0, "grad_norm": 1.7940745003323173, "language_loss": 0.76772308, "learning_rate": 8.251858653483288e-07, "loss": 0.7895, "num_input_tokens_seen": 126704255, "step": 5895, "time_per_iteration": 2.5018742084503174 }, { "auxiliary_loss_clip": 0.01155287, "auxiliary_loss_mlp": 0.01030401, "balance_loss_clip": 1.05022836, "balance_loss_mlp": 1.02349854, "epoch": 0.7089520832080803, "flos": 15516785462400.0, "grad_norm": 3.7582560147478357, "language_loss": 0.85741544, "learning_rate": 8.245555392417501e-07, "loss": 0.87927222, "num_input_tokens_seen": 126718910, "step": 5896, "time_per_iteration": 2.4600350856781006 }, { "auxiliary_loss_clip": 0.01096327, "auxiliary_loss_mlp": 0.01031702, "balance_loss_clip": 1.0390799, "balance_loss_mlp": 1.02484775, "epoch": 0.7090723260987194, "flos": 20412667077120.0, "grad_norm": 1.6591802783538785, "language_loss": 0.78639156, "learning_rate": 8.239253914491613e-07, "loss": 0.80767184, "num_input_tokens_seen": 126737235, "step": 5897, "time_per_iteration": 2.6094937324523926 }, { "auxiliary_loss_clip": 0.01120189, "auxiliary_loss_mlp": 0.01028182, "balance_loss_clip": 1.04524326, "balance_loss_mlp": 1.02089548, "epoch": 0.7091925689893585, "flos": 25668337271040.0, "grad_norm": 1.9579554016668974, "language_loss": 0.75238913, "learning_rate": 8.232954220661556e-07, "loss": 0.77387285, "num_input_tokens_seen": 126759970, "step": 5898, "time_per_iteration": 2.6289472579956055 }, { "auxiliary_loss_clip": 0.01165934, "auxiliary_loss_mlp": 0.01026979, "balance_loss_clip": 1.05119538, "balance_loss_mlp": 1.01986575, "epoch": 0.7093128118799976, "flos": 24206629213440.0, "grad_norm": 2.4908415612928754, "language_loss": 0.70219332, "learning_rate": 8.226656311882989e-07, "loss": 0.72412246, "num_input_tokens_seen": 126779280, "step": 5899, "time_per_iteration": 2.488739490509033 }, { "auxiliary_loss_clip": 0.01152205, "auxiliary_loss_mlp": 0.01025088, "balance_loss_clip": 1.05134666, "balance_loss_mlp": 1.01845086, "epoch": 0.7094330547706367, "flos": 16646786398080.0, "grad_norm": 2.0266106489030857, "language_loss": 0.76892614, "learning_rate": 8.22036018911129e-07, "loss": 0.79069912, "num_input_tokens_seen": 126797310, "step": 5900, "time_per_iteration": 3.2956314086914062 }, { "auxiliary_loss_clip": 0.0116876, "auxiliary_loss_mlp": 0.01028709, "balance_loss_clip": 1.04858816, "balance_loss_mlp": 1.0211302, "epoch": 0.7095532976612757, "flos": 16283370545280.0, "grad_norm": 1.9254889812378222, "language_loss": 0.80742991, "learning_rate": 8.214065853301599e-07, "loss": 0.82940459, "num_input_tokens_seen": 126812840, "step": 5901, "time_per_iteration": 2.437049627304077 }, { "auxiliary_loss_clip": 0.01050315, "auxiliary_loss_mlp": 0.01001844, "balance_loss_clip": 1.01036167, "balance_loss_mlp": 1.00080061, "epoch": 0.7096735405519149, "flos": 70722080559360.0, "grad_norm": 0.810224270062796, "language_loss": 0.58287442, "learning_rate": 8.207773305408734e-07, "loss": 0.603396, "num_input_tokens_seen": 126880060, "step": 5902, "time_per_iteration": 3.2224206924438477 }, { "auxiliary_loss_clip": 0.01118983, "auxiliary_loss_mlp": 0.01030667, "balance_loss_clip": 1.04260814, "balance_loss_mlp": 1.02298975, "epoch": 0.709793783442554, "flos": 23621500661760.0, "grad_norm": 2.073064510696055, "language_loss": 0.79268181, "learning_rate": 8.201482546387288e-07, "loss": 0.81417823, "num_input_tokens_seen": 126899535, "step": 5903, "time_per_iteration": 2.622485876083374 }, { "auxiliary_loss_clip": 0.01152732, "auxiliary_loss_mlp": 0.01028256, "balance_loss_clip": 1.04889321, "balance_loss_mlp": 1.02157712, "epoch": 0.709914026333193, "flos": 25993472204160.0, "grad_norm": 1.7217694993614585, "language_loss": 0.92229736, "learning_rate": 8.195193577191553e-07, "loss": 0.94410729, "num_input_tokens_seen": 126921365, "step": 5904, "time_per_iteration": 2.557504177093506 }, { "auxiliary_loss_clip": 0.01141713, "auxiliary_loss_mlp": 0.00760709, "balance_loss_clip": 1.04554749, "balance_loss_mlp": 1.00022292, "epoch": 0.7100342692238322, "flos": 24861531934080.0, "grad_norm": 1.588097479331853, "language_loss": 0.84546775, "learning_rate": 8.188906398775579e-07, "loss": 0.86449194, "num_input_tokens_seen": 126941910, "step": 5905, "time_per_iteration": 2.5589025020599365 }, { "auxiliary_loss_clip": 0.0116653, "auxiliary_loss_mlp": 0.00761443, "balance_loss_clip": 1.04805875, "balance_loss_mlp": 1.0002284, "epoch": 0.7101545121144712, "flos": 24932203943040.0, "grad_norm": 1.9123917841672793, "language_loss": 0.68748665, "learning_rate": 8.18262101209311e-07, "loss": 0.70676637, "num_input_tokens_seen": 126961120, "step": 5906, "time_per_iteration": 2.535945177078247 }, { "auxiliary_loss_clip": 0.01151664, "auxiliary_loss_mlp": 0.0102945, "balance_loss_clip": 1.04690719, "balance_loss_mlp": 1.0222795, "epoch": 0.7102747550051103, "flos": 23768842250880.0, "grad_norm": 1.7374613624801922, "language_loss": 0.70201194, "learning_rate": 8.176337418097626e-07, "loss": 0.72382319, "num_input_tokens_seen": 126981590, "step": 5907, "time_per_iteration": 2.4988150596618652 }, { "auxiliary_loss_clip": 0.01153484, "auxiliary_loss_mlp": 0.00761152, "balance_loss_clip": 1.04916489, "balance_loss_mlp": 1.00019264, "epoch": 0.7103949978957494, "flos": 15303907509120.0, "grad_norm": 2.043163575821353, "language_loss": 0.79307842, "learning_rate": 8.170055617742364e-07, "loss": 0.81222486, "num_input_tokens_seen": 126998870, "step": 5908, "time_per_iteration": 3.2770326137542725 }, { "auxiliary_loss_clip": 0.01135218, "auxiliary_loss_mlp": 0.0103012, "balance_loss_clip": 1.04659891, "balance_loss_mlp": 1.02271724, "epoch": 0.7105152407863885, "flos": 22638805401600.0, "grad_norm": 1.8422044169648748, "language_loss": 0.70486486, "learning_rate": 8.163775611980252e-07, "loss": 0.72651827, "num_input_tokens_seen": 127017980, "step": 5909, "time_per_iteration": 2.550135374069214 }, { "auxiliary_loss_clip": 0.01141051, "auxiliary_loss_mlp": 0.01024339, "balance_loss_clip": 1.04752088, "balance_loss_mlp": 1.01705575, "epoch": 0.7106354836770276, "flos": 17238594879360.0, "grad_norm": 1.6568353374910332, "language_loss": 0.78700662, "learning_rate": 8.157497401763982e-07, "loss": 0.80866051, "num_input_tokens_seen": 127035645, "step": 5910, "time_per_iteration": 3.281853199005127 }, { "auxiliary_loss_clip": 0.01151433, "auxiliary_loss_mlp": 0.01029137, "balance_loss_clip": 1.0483067, "balance_loss_mlp": 1.02224684, "epoch": 0.7107557265676667, "flos": 20193647898240.0, "grad_norm": 1.9339660613300778, "language_loss": 0.77723074, "learning_rate": 8.151220988045935e-07, "loss": 0.79903644, "num_input_tokens_seen": 127054900, "step": 5911, "time_per_iteration": 3.2386882305145264 }, { "auxiliary_loss_clip": 0.01149101, "auxiliary_loss_mlp": 0.01026731, "balance_loss_clip": 1.04536915, "balance_loss_mlp": 1.01990056, "epoch": 0.7108759694583058, "flos": 21507080613120.0, "grad_norm": 1.7023936830519582, "language_loss": 0.82699609, "learning_rate": 8.144946371778234e-07, "loss": 0.84875441, "num_input_tokens_seen": 127075010, "step": 5912, "time_per_iteration": 2.495692253112793 }, { "auxiliary_loss_clip": 0.01136961, "auxiliary_loss_mlp": 0.00761111, "balance_loss_clip": 1.04496598, "balance_loss_mlp": 1.00021064, "epoch": 0.7109962123489448, "flos": 24061909317120.0, "grad_norm": 1.5881360834084037, "language_loss": 0.78092092, "learning_rate": 8.138673553912751e-07, "loss": 0.79990166, "num_input_tokens_seen": 127095570, "step": 5913, "time_per_iteration": 2.5643937587738037 }, { "auxiliary_loss_clip": 0.01108065, "auxiliary_loss_mlp": 0.01028318, "balance_loss_clip": 1.0439713, "balance_loss_mlp": 1.02111208, "epoch": 0.711116455239584, "flos": 30480474326400.0, "grad_norm": 2.4338272594919914, "language_loss": 0.56729263, "learning_rate": 8.132402535401059e-07, "loss": 0.58865643, "num_input_tokens_seen": 127116825, "step": 5914, "time_per_iteration": 2.657670021057129 }, { "auxiliary_loss_clip": 0.01146857, "auxiliary_loss_mlp": 0.01021624, "balance_loss_clip": 1.04690993, "balance_loss_mlp": 1.01503456, "epoch": 0.711236698130223, "flos": 25045610158080.0, "grad_norm": 1.732933656880314, "language_loss": 0.74288392, "learning_rate": 8.126133317194465e-07, "loss": 0.76456869, "num_input_tokens_seen": 127137015, "step": 5915, "time_per_iteration": 2.5464138984680176 }, { "auxiliary_loss_clip": 0.01098627, "auxiliary_loss_mlp": 0.01029175, "balance_loss_clip": 1.03845572, "balance_loss_mlp": 1.0221417, "epoch": 0.7113569410208621, "flos": 24206701040640.0, "grad_norm": 1.7189237815626226, "language_loss": 0.74213123, "learning_rate": 8.11986590024401e-07, "loss": 0.7634092, "num_input_tokens_seen": 127156755, "step": 5916, "time_per_iteration": 2.6436192989349365 }, { "auxiliary_loss_clip": 0.01137097, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.04483759, "balance_loss_mlp": 1.02417159, "epoch": 0.7114771839115013, "flos": 35439306526080.0, "grad_norm": 1.57880264053548, "language_loss": 0.68848467, "learning_rate": 8.113600285500442e-07, "loss": 0.71017015, "num_input_tokens_seen": 127176965, "step": 5917, "time_per_iteration": 2.6642587184906006 }, { "auxiliary_loss_clip": 0.01165992, "auxiliary_loss_mlp": 0.01023922, "balance_loss_clip": 1.04810941, "balance_loss_mlp": 1.01706731, "epoch": 0.7115974268021403, "flos": 21099458096640.0, "grad_norm": 1.7439420593869805, "language_loss": 0.7443608, "learning_rate": 8.107336473914268e-07, "loss": 0.76625991, "num_input_tokens_seen": 127195595, "step": 5918, "time_per_iteration": 2.473623514175415 }, { "auxiliary_loss_clip": 0.01038293, "auxiliary_loss_mlp": 0.01002865, "balance_loss_clip": 1.0107851, "balance_loss_mlp": 1.00183392, "epoch": 0.7117176696927794, "flos": 56752866616320.0, "grad_norm": 0.7638180467868849, "language_loss": 0.55711603, "learning_rate": 8.101074466435694e-07, "loss": 0.57752764, "num_input_tokens_seen": 127255070, "step": 5919, "time_per_iteration": 3.082484483718872 }, { "auxiliary_loss_clip": 0.01147301, "auxiliary_loss_mlp": 0.0103454, "balance_loss_clip": 1.04678619, "balance_loss_mlp": 1.02748871, "epoch": 0.7118379125834186, "flos": 15925269905280.0, "grad_norm": 1.6869747409131655, "language_loss": 0.67543042, "learning_rate": 8.094814264014662e-07, "loss": 0.69724888, "num_input_tokens_seen": 127273825, "step": 5920, "time_per_iteration": 2.516599178314209 }, { "auxiliary_loss_clip": 0.0116952, "auxiliary_loss_mlp": 0.01032581, "balance_loss_clip": 1.04948604, "balance_loss_mlp": 1.02481723, "epoch": 0.7119581554740576, "flos": 20193360589440.0, "grad_norm": 2.0154238168501584, "language_loss": 0.81293356, "learning_rate": 8.088555867600844e-07, "loss": 0.83495456, "num_input_tokens_seen": 127289990, "step": 5921, "time_per_iteration": 2.4605178833007812 }, { "auxiliary_loss_clip": 0.01120271, "auxiliary_loss_mlp": 0.01027936, "balance_loss_clip": 1.04357076, "balance_loss_mlp": 1.02110195, "epoch": 0.7120783983646967, "flos": 34715383822080.0, "grad_norm": 1.723939092615646, "language_loss": 0.60584635, "learning_rate": 8.08229927814362e-07, "loss": 0.62732846, "num_input_tokens_seen": 127312880, "step": 5922, "time_per_iteration": 2.7043328285217285 }, { "auxiliary_loss_clip": 0.0112516, "auxiliary_loss_mlp": 0.01026741, "balance_loss_clip": 1.04441893, "balance_loss_mlp": 1.01988363, "epoch": 0.7121986412553358, "flos": 26359114700160.0, "grad_norm": 1.6811045328903538, "language_loss": 0.64586228, "learning_rate": 8.076044496592134e-07, "loss": 0.66738129, "num_input_tokens_seen": 127334730, "step": 5923, "time_per_iteration": 2.6040077209472656 }, { "auxiliary_loss_clip": 0.01140287, "auxiliary_loss_mlp": 0.01027507, "balance_loss_clip": 1.04900861, "balance_loss_mlp": 1.02018738, "epoch": 0.7123188841459749, "flos": 11145344371200.0, "grad_norm": 2.3999773515420952, "language_loss": 0.77970093, "learning_rate": 8.069791523895204e-07, "loss": 0.80137885, "num_input_tokens_seen": 127351180, "step": 5924, "time_per_iteration": 2.5028090476989746 }, { "auxiliary_loss_clip": 0.01113194, "auxiliary_loss_mlp": 0.01029583, "balance_loss_clip": 1.03968239, "balance_loss_mlp": 1.02234983, "epoch": 0.7124391270366139, "flos": 20811670329600.0, "grad_norm": 1.7376676489009597, "language_loss": 0.77334929, "learning_rate": 8.063540361001422e-07, "loss": 0.7947771, "num_input_tokens_seen": 127369750, "step": 5925, "time_per_iteration": 3.2933363914489746 }, { "auxiliary_loss_clip": 0.01119282, "auxiliary_loss_mlp": 0.01033282, "balance_loss_clip": 1.04349422, "balance_loss_mlp": 1.02559614, "epoch": 0.7125593699272531, "flos": 17603734584960.0, "grad_norm": 2.063504550278858, "language_loss": 0.79362738, "learning_rate": 8.057291008859069e-07, "loss": 0.815153, "num_input_tokens_seen": 127387910, "step": 5926, "time_per_iteration": 2.5502047538757324 }, { "auxiliary_loss_clip": 0.01145814, "auxiliary_loss_mlp": 0.01027031, "balance_loss_clip": 1.04355788, "balance_loss_mlp": 1.02010453, "epoch": 0.7126796128178922, "flos": 28654057526400.0, "grad_norm": 1.8990206572128372, "language_loss": 0.68124866, "learning_rate": 8.051043468416187e-07, "loss": 0.70297718, "num_input_tokens_seen": 127409160, "step": 5927, "time_per_iteration": 2.5870728492736816 }, { "auxiliary_loss_clip": 0.0116586, "auxiliary_loss_mlp": 0.01028979, "balance_loss_clip": 1.04990995, "balance_loss_mlp": 1.02182972, "epoch": 0.7127998557085312, "flos": 16034438315520.0, "grad_norm": 1.9006594967857438, "language_loss": 0.82457882, "learning_rate": 8.044797740620506e-07, "loss": 0.84652722, "num_input_tokens_seen": 127427765, "step": 5928, "time_per_iteration": 2.4612295627593994 }, { "auxiliary_loss_clip": 0.01105955, "auxiliary_loss_mlp": 0.01028286, "balance_loss_clip": 1.0436573, "balance_loss_mlp": 1.02137184, "epoch": 0.7129200985991703, "flos": 23403271582080.0, "grad_norm": 1.9216664242861543, "language_loss": 0.78547251, "learning_rate": 8.038553826419494e-07, "loss": 0.80681497, "num_input_tokens_seen": 127446475, "step": 5929, "time_per_iteration": 2.6117544174194336 }, { "auxiliary_loss_clip": 0.01164476, "auxiliary_loss_mlp": 0.01029951, "balance_loss_clip": 1.04661107, "balance_loss_mlp": 1.0231626, "epoch": 0.7130403414898094, "flos": 21397445326080.0, "grad_norm": 1.6108016464088828, "language_loss": 0.81147122, "learning_rate": 8.032311726760364e-07, "loss": 0.83341545, "num_input_tokens_seen": 127467695, "step": 5930, "time_per_iteration": 2.4909331798553467 }, { "auxiliary_loss_clip": 0.01116278, "auxiliary_loss_mlp": 0.01028155, "balance_loss_clip": 1.0428648, "balance_loss_mlp": 1.02091324, "epoch": 0.7131605843804485, "flos": 74739045306240.0, "grad_norm": 1.87788207443658, "language_loss": 0.68661767, "learning_rate": 8.026071442590022e-07, "loss": 0.70806205, "num_input_tokens_seen": 127494590, "step": 5931, "time_per_iteration": 2.9639930725097656 }, { "auxiliary_loss_clip": 0.01153221, "auxiliary_loss_mlp": 0.01028275, "balance_loss_clip": 1.05070615, "balance_loss_mlp": 1.02150989, "epoch": 0.7132808272710875, "flos": 18368739469440.0, "grad_norm": 1.8416012499941172, "language_loss": 0.80782521, "learning_rate": 8.019832974855134e-07, "loss": 0.82964015, "num_input_tokens_seen": 127512550, "step": 5932, "time_per_iteration": 2.4698448181152344 }, { "auxiliary_loss_clip": 0.0112397, "auxiliary_loss_mlp": 0.01031207, "balance_loss_clip": 1.04510272, "balance_loss_mlp": 1.02385831, "epoch": 0.7134010701617267, "flos": 23253380127360.0, "grad_norm": 2.151551917725363, "language_loss": 0.82066029, "learning_rate": 8.013596324502052e-07, "loss": 0.84221208, "num_input_tokens_seen": 127531015, "step": 5933, "time_per_iteration": 2.5817081928253174 }, { "auxiliary_loss_clip": 0.01148767, "auxiliary_loss_mlp": 0.01027861, "balance_loss_clip": 1.04798806, "balance_loss_mlp": 1.0214746, "epoch": 0.7135213130523658, "flos": 23653137565440.0, "grad_norm": 1.882250517348598, "language_loss": 0.78458548, "learning_rate": 8.007361492476872e-07, "loss": 0.80635178, "num_input_tokens_seen": 127550340, "step": 5934, "time_per_iteration": 3.307887554168701 }, { "auxiliary_loss_clip": 0.01131457, "auxiliary_loss_mlp": 0.01030951, "balance_loss_clip": 1.04405177, "balance_loss_mlp": 1.02403104, "epoch": 0.7136415559430048, "flos": 24790644443520.0, "grad_norm": 1.4774560854308052, "language_loss": 0.79130387, "learning_rate": 8.001128479725426e-07, "loss": 0.81292796, "num_input_tokens_seen": 127572245, "step": 5935, "time_per_iteration": 2.589768171310425 }, { "auxiliary_loss_clip": 0.01101787, "auxiliary_loss_mlp": 0.01024259, "balance_loss_clip": 1.04019928, "balance_loss_mlp": 1.01767313, "epoch": 0.713761798833644, "flos": 18296954138880.0, "grad_norm": 1.5378703662022588, "language_loss": 0.81004906, "learning_rate": 7.994897287193248e-07, "loss": 0.83130956, "num_input_tokens_seen": 127591625, "step": 5936, "time_per_iteration": 3.4010837078094482 }, { "auxiliary_loss_clip": 0.01156084, "auxiliary_loss_mlp": 0.01024332, "balance_loss_clip": 1.04831707, "balance_loss_mlp": 1.01734352, "epoch": 0.713882041724283, "flos": 15558262692480.0, "grad_norm": 2.2941994844673266, "language_loss": 0.83325589, "learning_rate": 7.988667915825605e-07, "loss": 0.85505998, "num_input_tokens_seen": 127608690, "step": 5937, "time_per_iteration": 3.272545337677002 }, { "auxiliary_loss_clip": 0.0114038, "auxiliary_loss_mlp": 0.01029276, "balance_loss_clip": 1.04767585, "balance_loss_mlp": 1.02181995, "epoch": 0.7140022846149221, "flos": 24061011477120.0, "grad_norm": 1.9754028582420127, "language_loss": 0.75904524, "learning_rate": 7.982440366567491e-07, "loss": 0.78074181, "num_input_tokens_seen": 127627180, "step": 5938, "time_per_iteration": 2.5348894596099854 }, { "auxiliary_loss_clip": 0.01145227, "auxiliary_loss_mlp": 0.01023071, "balance_loss_clip": 1.04463315, "balance_loss_mlp": 1.0165503, "epoch": 0.7141225275055613, "flos": 27891710248320.0, "grad_norm": 1.6306928679278025, "language_loss": 0.75448823, "learning_rate": 7.97621464036361e-07, "loss": 0.77617121, "num_input_tokens_seen": 127648940, "step": 5939, "time_per_iteration": 2.5549774169921875 }, { "auxiliary_loss_clip": 0.0115485, "auxiliary_loss_mlp": 0.01027339, "balance_loss_clip": 1.04658544, "balance_loss_mlp": 1.02046096, "epoch": 0.7142427703962003, "flos": 19682603147520.0, "grad_norm": 1.5000406976633862, "language_loss": 0.68013501, "learning_rate": 7.969990738158417e-07, "loss": 0.70195687, "num_input_tokens_seen": 127667350, "step": 5940, "time_per_iteration": 2.5013041496276855 }, { "auxiliary_loss_clip": 0.01154024, "auxiliary_loss_mlp": 0.01027212, "balance_loss_clip": 1.04889512, "balance_loss_mlp": 1.01928496, "epoch": 0.7143630132868394, "flos": 21032377447680.0, "grad_norm": 1.9148934217850202, "language_loss": 0.84998155, "learning_rate": 7.963768660896062e-07, "loss": 0.87179393, "num_input_tokens_seen": 127685760, "step": 5941, "time_per_iteration": 2.498347043991089 }, { "auxiliary_loss_clip": 0.0115637, "auxiliary_loss_mlp": 0.01028059, "balance_loss_clip": 1.0469079, "balance_loss_mlp": 1.02088308, "epoch": 0.7144832561774785, "flos": 24129923719680.0, "grad_norm": 1.8310598865926073, "language_loss": 0.82487202, "learning_rate": 7.957548409520432e-07, "loss": 0.84671628, "num_input_tokens_seen": 127704985, "step": 5942, "time_per_iteration": 2.6304848194122314 }, { "auxiliary_loss_clip": 0.01125136, "auxiliary_loss_mlp": 0.01025997, "balance_loss_clip": 1.04417872, "balance_loss_mlp": 1.01885891, "epoch": 0.7146034990681176, "flos": 16325817442560.0, "grad_norm": 1.8495694538886032, "language_loss": 0.839849, "learning_rate": 7.951329984975135e-07, "loss": 0.86136031, "num_input_tokens_seen": 127721925, "step": 5943, "time_per_iteration": 2.5400516986846924 }, { "auxiliary_loss_clip": 0.01029281, "auxiliary_loss_mlp": 0.01002755, "balance_loss_clip": 1.01126611, "balance_loss_mlp": 1.00172997, "epoch": 0.7147237419587567, "flos": 69627164232960.0, "grad_norm": 0.7098325189050411, "language_loss": 0.54323924, "learning_rate": 7.94511338820349e-07, "loss": 0.56355965, "num_input_tokens_seen": 127784230, "step": 5944, "time_per_iteration": 3.15289306640625 }, { "auxiliary_loss_clip": 0.01137449, "auxiliary_loss_mlp": 0.00761356, "balance_loss_clip": 1.04642975, "balance_loss_mlp": 1.00017262, "epoch": 0.7148439848493958, "flos": 22266806198400.0, "grad_norm": 1.97482872786692, "language_loss": 0.78072196, "learning_rate": 7.938898620148575e-07, "loss": 0.79970998, "num_input_tokens_seen": 127801990, "step": 5945, "time_per_iteration": 2.5303831100463867 }, { "auxiliary_loss_clip": 0.01139795, "auxiliary_loss_mlp": 0.0102701, "balance_loss_clip": 1.04668951, "balance_loss_mlp": 1.01984906, "epoch": 0.7149642277400349, "flos": 17931383470080.0, "grad_norm": 2.115003691328379, "language_loss": 0.70841235, "learning_rate": 7.932685681753135e-07, "loss": 0.73008037, "num_input_tokens_seen": 127819270, "step": 5946, "time_per_iteration": 2.4923274517059326 }, { "auxiliary_loss_clip": 0.01163364, "auxiliary_loss_mlp": 0.01028036, "balance_loss_clip": 1.04920411, "balance_loss_mlp": 1.02137518, "epoch": 0.7150844706306739, "flos": 31681937370240.0, "grad_norm": 2.0422377501032947, "language_loss": 0.62757593, "learning_rate": 7.92647457395969e-07, "loss": 0.64948994, "num_input_tokens_seen": 127841095, "step": 5947, "time_per_iteration": 2.5573573112487793 }, { "auxiliary_loss_clip": 0.01100034, "auxiliary_loss_mlp": 0.01027153, "balance_loss_clip": 1.03776455, "balance_loss_mlp": 1.01941645, "epoch": 0.7152047135213131, "flos": 10926217451520.0, "grad_norm": 2.3360823123161483, "language_loss": 0.73830962, "learning_rate": 7.920265297710444e-07, "loss": 0.75958157, "num_input_tokens_seen": 127858485, "step": 5948, "time_per_iteration": 2.5832557678222656 }, { "auxiliary_loss_clip": 0.0115484, "auxiliary_loss_mlp": 0.01020854, "balance_loss_clip": 1.04858661, "balance_loss_mlp": 1.01392758, "epoch": 0.7153249564119522, "flos": 20995640812800.0, "grad_norm": 1.7613992082882095, "language_loss": 0.73099625, "learning_rate": 7.914057853947363e-07, "loss": 0.7527532, "num_input_tokens_seen": 127877665, "step": 5949, "time_per_iteration": 2.4927306175231934 }, { "auxiliary_loss_clip": 0.01121981, "auxiliary_loss_mlp": 0.01035484, "balance_loss_clip": 1.04368639, "balance_loss_mlp": 1.02838516, "epoch": 0.7154451993025912, "flos": 24243114453120.0, "grad_norm": 1.7709549146715946, "language_loss": 0.62599266, "learning_rate": 7.907852243612089e-07, "loss": 0.64756733, "num_input_tokens_seen": 127898070, "step": 5950, "time_per_iteration": 2.5997438430786133 }, { "auxiliary_loss_clip": 0.01133954, "auxiliary_loss_mlp": 0.01023595, "balance_loss_clip": 1.04398155, "balance_loss_mlp": 1.01676702, "epoch": 0.7155654421932304, "flos": 23330947547520.0, "grad_norm": 1.7528237280657342, "language_loss": 0.72195303, "learning_rate": 7.901648467646009e-07, "loss": 0.74352849, "num_input_tokens_seen": 127917010, "step": 5951, "time_per_iteration": 3.32944393157959 }, { "auxiliary_loss_clip": 0.01169616, "auxiliary_loss_mlp": 0.01030894, "balance_loss_clip": 1.05052423, "balance_loss_mlp": 1.02303767, "epoch": 0.7156856850838694, "flos": 22711883621760.0, "grad_norm": 1.5475573810004368, "language_loss": 0.72447568, "learning_rate": 7.895446526990244e-07, "loss": 0.74648082, "num_input_tokens_seen": 127937025, "step": 5952, "time_per_iteration": 2.484856367111206 }, { "auxiliary_loss_clip": 0.0112103, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.04514003, "balance_loss_mlp": 1.019642, "epoch": 0.7158059279745085, "flos": 19865424395520.0, "grad_norm": 1.5989460613176696, "language_loss": 0.75720429, "learning_rate": 7.889246422585609e-07, "loss": 0.77868551, "num_input_tokens_seen": 127956410, "step": 5953, "time_per_iteration": 2.5970053672790527 }, { "auxiliary_loss_clip": 0.01167155, "auxiliary_loss_mlp": 0.01027885, "balance_loss_clip": 1.04919517, "balance_loss_mlp": 1.02087235, "epoch": 0.7159261708651476, "flos": 24134772055680.0, "grad_norm": 2.166469838879143, "language_loss": 0.73461264, "learning_rate": 7.883048155372675e-07, "loss": 0.75656295, "num_input_tokens_seen": 127974925, "step": 5954, "time_per_iteration": 2.477022886276245 }, { "auxiliary_loss_clip": 0.01144354, "auxiliary_loss_mlp": 0.01033002, "balance_loss_clip": 1.04769862, "balance_loss_mlp": 1.02625775, "epoch": 0.7160464137557867, "flos": 16983198201600.0, "grad_norm": 2.0631318537498884, "language_loss": 0.71939278, "learning_rate": 7.876851726291698e-07, "loss": 0.74116635, "num_input_tokens_seen": 127993225, "step": 5955, "time_per_iteration": 2.5116283893585205 }, { "auxiliary_loss_clip": 0.01127623, "auxiliary_loss_mlp": 0.01029785, "balance_loss_clip": 1.04342711, "balance_loss_mlp": 1.02300525, "epoch": 0.7161666566464258, "flos": 25228251838080.0, "grad_norm": 2.019206919823149, "language_loss": 0.7818051, "learning_rate": 7.870657136282666e-07, "loss": 0.80337918, "num_input_tokens_seen": 128012085, "step": 5956, "time_per_iteration": 2.611811399459839 }, { "auxiliary_loss_clip": 0.01145871, "auxiliary_loss_mlp": 0.01027309, "balance_loss_clip": 1.04491138, "balance_loss_mlp": 1.02070785, "epoch": 0.7162868995370649, "flos": 26468390851200.0, "grad_norm": 1.4915529432765657, "language_loss": 0.81651485, "learning_rate": 7.86446438628531e-07, "loss": 0.83824676, "num_input_tokens_seen": 128033155, "step": 5957, "time_per_iteration": 2.5431430339813232 }, { "auxiliary_loss_clip": 0.010592, "auxiliary_loss_mlp": 0.01001543, "balance_loss_clip": 1.0109427, "balance_loss_mlp": 1.00049388, "epoch": 0.716407142427704, "flos": 69998912040960.0, "grad_norm": 0.7711958125089252, "language_loss": 0.56909561, "learning_rate": 7.858273477239059e-07, "loss": 0.58970296, "num_input_tokens_seen": 128101575, "step": 5958, "time_per_iteration": 3.100001573562622 }, { "auxiliary_loss_clip": 0.01096805, "auxiliary_loss_mlp": 0.01028381, "balance_loss_clip": 1.0400064, "balance_loss_mlp": 1.02065003, "epoch": 0.716527385318343, "flos": 20740459616640.0, "grad_norm": 1.5987328237543916, "language_loss": 0.71456599, "learning_rate": 7.852084410083067e-07, "loss": 0.73581785, "num_input_tokens_seen": 128120395, "step": 5959, "time_per_iteration": 2.6132917404174805 }, { "auxiliary_loss_clip": 0.01132234, "auxiliary_loss_mlp": 0.01023943, "balance_loss_clip": 1.04473448, "balance_loss_mlp": 1.01753569, "epoch": 0.7166476282089821, "flos": 25371966153600.0, "grad_norm": 1.5475383480146827, "language_loss": 0.63806546, "learning_rate": 7.84589718575621e-07, "loss": 0.65962732, "num_input_tokens_seen": 128140840, "step": 5960, "time_per_iteration": 3.3493032455444336 }, { "auxiliary_loss_clip": 0.01136965, "auxiliary_loss_mlp": 0.01025521, "balance_loss_clip": 1.04041028, "balance_loss_mlp": 1.0183568, "epoch": 0.7167678710996213, "flos": 24133730561280.0, "grad_norm": 2.0998701163098366, "language_loss": 0.68744814, "learning_rate": 7.83971180519708e-07, "loss": 0.70907307, "num_input_tokens_seen": 128159695, "step": 5961, "time_per_iteration": 2.54312801361084 }, { "auxiliary_loss_clip": 0.01168481, "auxiliary_loss_mlp": 0.01027123, "balance_loss_clip": 1.04997444, "balance_loss_mlp": 1.01968455, "epoch": 0.7168881139902603, "flos": 30226586019840.0, "grad_norm": 1.9233664046993462, "language_loss": 0.75718504, "learning_rate": 7.833528269344008e-07, "loss": 0.77914107, "num_input_tokens_seen": 128179600, "step": 5962, "time_per_iteration": 3.3426408767700195 }, { "auxiliary_loss_clip": 0.01121338, "auxiliary_loss_mlp": 0.01030428, "balance_loss_clip": 1.04475904, "balance_loss_mlp": 1.02331746, "epoch": 0.7170083568808994, "flos": 14606414236800.0, "grad_norm": 1.920163904192582, "language_loss": 0.77427518, "learning_rate": 7.827346579135023e-07, "loss": 0.79579282, "num_input_tokens_seen": 128196940, "step": 5963, "time_per_iteration": 2.5482499599456787 }, { "auxiliary_loss_clip": 0.01135901, "auxiliary_loss_mlp": 0.01027895, "balance_loss_clip": 1.04419482, "balance_loss_mlp": 1.02091813, "epoch": 0.7171285997715385, "flos": 23331091201920.0, "grad_norm": 1.7767441886995217, "language_loss": 0.83184975, "learning_rate": 7.821166735507885e-07, "loss": 0.85348767, "num_input_tokens_seen": 128215970, "step": 5964, "time_per_iteration": 3.2305281162261963 }, { "auxiliary_loss_clip": 0.01166009, "auxiliary_loss_mlp": 0.01027391, "balance_loss_clip": 1.04971135, "balance_loss_mlp": 1.02120066, "epoch": 0.7172488426621776, "flos": 16543543731840.0, "grad_norm": 2.1807081698456594, "language_loss": 0.68563569, "learning_rate": 7.81498873940007e-07, "loss": 0.70756966, "num_input_tokens_seen": 128233185, "step": 5965, "time_per_iteration": 2.446850061416626 }, { "auxiliary_loss_clip": 0.01158793, "auxiliary_loss_mlp": 0.01029435, "balance_loss_clip": 1.04665017, "balance_loss_mlp": 1.02211547, "epoch": 0.7173690855528166, "flos": 26541612725760.0, "grad_norm": 2.4254811945634285, "language_loss": 0.77381849, "learning_rate": 7.808812591748768e-07, "loss": 0.79570079, "num_input_tokens_seen": 128253565, "step": 5966, "time_per_iteration": 2.5213825702667236 }, { "auxiliary_loss_clip": 0.01123133, "auxiliary_loss_mlp": 0.0102729, "balance_loss_clip": 1.04392886, "balance_loss_mlp": 1.01937711, "epoch": 0.7174893284434558, "flos": 22784099915520.0, "grad_norm": 1.9202687117584822, "language_loss": 0.64687777, "learning_rate": 7.802638293490915e-07, "loss": 0.66838205, "num_input_tokens_seen": 128273210, "step": 5967, "time_per_iteration": 2.5792033672332764 }, { "auxiliary_loss_clip": 0.01143251, "auxiliary_loss_mlp": 0.01029313, "balance_loss_clip": 1.04709029, "balance_loss_mlp": 1.02245855, "epoch": 0.7176095713340949, "flos": 23293564467840.0, "grad_norm": 1.5970908042288992, "language_loss": 0.76645488, "learning_rate": 7.796465845563123e-07, "loss": 0.78818053, "num_input_tokens_seen": 128292085, "step": 5968, "time_per_iteration": 2.5741922855377197 }, { "auxiliary_loss_clip": 0.01133917, "auxiliary_loss_mlp": 0.00760815, "balance_loss_clip": 1.04559457, "balance_loss_mlp": 1.0002172, "epoch": 0.7177298142247339, "flos": 25591631777280.0, "grad_norm": 1.954043488595539, "language_loss": 0.79639053, "learning_rate": 7.790295248901766e-07, "loss": 0.81533778, "num_input_tokens_seen": 128313215, "step": 5969, "time_per_iteration": 2.5781195163726807 }, { "auxiliary_loss_clip": 0.0115151, "auxiliary_loss_mlp": 0.01031501, "balance_loss_clip": 1.04627442, "balance_loss_mlp": 1.02453589, "epoch": 0.7178500571153731, "flos": 31652778504960.0, "grad_norm": 2.11669043422757, "language_loss": 0.62434196, "learning_rate": 7.784126504442902e-07, "loss": 0.64617217, "num_input_tokens_seen": 128336445, "step": 5970, "time_per_iteration": 2.596994638442993 }, { "auxiliary_loss_clip": 0.01118317, "auxiliary_loss_mlp": 0.01027869, "balance_loss_clip": 1.04361343, "balance_loss_mlp": 1.02071619, "epoch": 0.7179703000060121, "flos": 19427242383360.0, "grad_norm": 1.4660849362495478, "language_loss": 0.67999989, "learning_rate": 7.777959613122351e-07, "loss": 0.70146173, "num_input_tokens_seen": 128356270, "step": 5971, "time_per_iteration": 2.5574264526367188 }, { "auxiliary_loss_clip": 0.01133471, "auxiliary_loss_mlp": 0.01030171, "balance_loss_clip": 1.04718101, "balance_loss_mlp": 1.02324462, "epoch": 0.7180905428966512, "flos": 28839249072000.0, "grad_norm": 1.8824534603329977, "language_loss": 0.78103983, "learning_rate": 7.771794575875604e-07, "loss": 0.8026762, "num_input_tokens_seen": 128378140, "step": 5972, "time_per_iteration": 2.5651705265045166 }, { "auxiliary_loss_clip": 0.01150719, "auxiliary_loss_mlp": 0.01031112, "balance_loss_clip": 1.04836011, "balance_loss_mlp": 1.02349436, "epoch": 0.7182107857872904, "flos": 20047563285120.0, "grad_norm": 2.1714685551824986, "language_loss": 0.77323079, "learning_rate": 7.765631393637888e-07, "loss": 0.79504919, "num_input_tokens_seen": 128396335, "step": 5973, "time_per_iteration": 2.4847867488861084 }, { "auxiliary_loss_clip": 0.01148721, "auxiliary_loss_mlp": 0.01025342, "balance_loss_clip": 1.04578793, "balance_loss_mlp": 1.0178169, "epoch": 0.7183310286779294, "flos": 22747686503040.0, "grad_norm": 2.7080029078864825, "language_loss": 0.47919869, "learning_rate": 7.75947006734417e-07, "loss": 0.50093937, "num_input_tokens_seen": 128414115, "step": 5974, "time_per_iteration": 2.4924046993255615 }, { "auxiliary_loss_clip": 0.01164504, "auxiliary_loss_mlp": 0.01020579, "balance_loss_clip": 1.04609609, "balance_loss_mlp": 1.01358724, "epoch": 0.7184512715685685, "flos": 17158262112000.0, "grad_norm": 1.992307425901427, "language_loss": 0.82479584, "learning_rate": 7.753310597929101e-07, "loss": 0.84664667, "num_input_tokens_seen": 128430755, "step": 5975, "time_per_iteration": 2.4324145317077637 }, { "auxiliary_loss_clip": 0.01058256, "auxiliary_loss_mlp": 0.01001963, "balance_loss_clip": 1.01003563, "balance_loss_mlp": 1.00093794, "epoch": 0.7185715144592076, "flos": 65509611448320.0, "grad_norm": 0.8558992715316296, "language_loss": 0.55145454, "learning_rate": 7.747152986327095e-07, "loss": 0.57205677, "num_input_tokens_seen": 128491300, "step": 5976, "time_per_iteration": 2.9796600341796875 }, { "auxiliary_loss_clip": 0.01114681, "auxiliary_loss_mlp": 0.01027579, "balance_loss_clip": 1.04304028, "balance_loss_mlp": 1.02083206, "epoch": 0.7186917573498467, "flos": 16180522928640.0, "grad_norm": 1.7621209743810458, "language_loss": 0.67889071, "learning_rate": 7.740997233472228e-07, "loss": 0.70031333, "num_input_tokens_seen": 128508920, "step": 5977, "time_per_iteration": 3.2995247840881348 }, { "auxiliary_loss_clip": 0.011367, "auxiliary_loss_mlp": 0.01026052, "balance_loss_clip": 1.04439139, "balance_loss_mlp": 1.01951647, "epoch": 0.7188120002404857, "flos": 29242274647680.0, "grad_norm": 2.460371751591185, "language_loss": 0.70806354, "learning_rate": 7.734843340298329e-07, "loss": 0.72969103, "num_input_tokens_seen": 128528745, "step": 5978, "time_per_iteration": 2.587094306945801 }, { "auxiliary_loss_clip": 0.0114331, "auxiliary_loss_mlp": 0.01028658, "balance_loss_clip": 1.04464126, "balance_loss_mlp": 1.02108204, "epoch": 0.7189322431311249, "flos": 33401161008000.0, "grad_norm": 1.935060334699244, "language_loss": 0.75292885, "learning_rate": 7.72869130773895e-07, "loss": 0.77464861, "num_input_tokens_seen": 128549345, "step": 5979, "time_per_iteration": 2.6138110160827637 }, { "auxiliary_loss_clip": 0.01049899, "auxiliary_loss_mlp": 0.01001106, "balance_loss_clip": 1.01013923, "balance_loss_mlp": 1.00008106, "epoch": 0.719052486021764, "flos": 61351263792000.0, "grad_norm": 0.7890391625262524, "language_loss": 0.59398305, "learning_rate": 7.722541136727343e-07, "loss": 0.61449313, "num_input_tokens_seen": 128605360, "step": 5980, "time_per_iteration": 2.964895009994507 }, { "auxiliary_loss_clip": 0.01151414, "auxiliary_loss_mlp": 0.01023892, "balance_loss_clip": 1.04654908, "balance_loss_mlp": 1.01714456, "epoch": 0.719172728912403, "flos": 15596795007360.0, "grad_norm": 2.042866559041551, "language_loss": 0.80497921, "learning_rate": 7.716392828196483e-07, "loss": 0.82673228, "num_input_tokens_seen": 128623160, "step": 5981, "time_per_iteration": 2.4552013874053955 }, { "auxiliary_loss_clip": 0.01153344, "auxiliary_loss_mlp": 0.01036069, "balance_loss_clip": 1.04867101, "balance_loss_mlp": 1.02874327, "epoch": 0.7192929718030422, "flos": 15553162961280.0, "grad_norm": 2.7445882801800643, "language_loss": 0.77061582, "learning_rate": 7.710246383079064e-07, "loss": 0.79250991, "num_input_tokens_seen": 128638545, "step": 5982, "time_per_iteration": 2.453763246536255 }, { "auxiliary_loss_clip": 0.01136849, "auxiliary_loss_mlp": 0.0103055, "balance_loss_clip": 1.04074848, "balance_loss_mlp": 1.02352834, "epoch": 0.7194132146936812, "flos": 21862487733120.0, "grad_norm": 3.179183341636985, "language_loss": 0.92354667, "learning_rate": 7.704101802307492e-07, "loss": 0.94522077, "num_input_tokens_seen": 128650845, "step": 5983, "time_per_iteration": 2.4911720752716064 }, { "auxiliary_loss_clip": 0.01117308, "auxiliary_loss_mlp": 0.01033323, "balance_loss_clip": 1.04234815, "balance_loss_mlp": 1.02581882, "epoch": 0.7195334575843203, "flos": 27338900958720.0, "grad_norm": 1.9066371765817658, "language_loss": 0.87085879, "learning_rate": 7.697959086813912e-07, "loss": 0.89236516, "num_input_tokens_seen": 128667010, "step": 5984, "time_per_iteration": 2.6022090911865234 }, { "auxiliary_loss_clip": 0.01116817, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.04229879, "balance_loss_mlp": 1.02233291, "epoch": 0.7196537004749595, "flos": 18770615809920.0, "grad_norm": 1.5787767181035346, "language_loss": 0.80093139, "learning_rate": 7.691818237530145e-07, "loss": 0.82239282, "num_input_tokens_seen": 128685870, "step": 5985, "time_per_iteration": 2.540581464767456 }, { "auxiliary_loss_clip": 0.01123047, "auxiliary_loss_mlp": 0.0102596, "balance_loss_clip": 1.04407859, "balance_loss_mlp": 1.01884937, "epoch": 0.7197739433655985, "flos": 24531009960960.0, "grad_norm": 1.9547924409232116, "language_loss": 0.77168894, "learning_rate": 7.685679255387774e-07, "loss": 0.79317898, "num_input_tokens_seen": 128704185, "step": 5986, "time_per_iteration": 3.4047093391418457 }, { "auxiliary_loss_clip": 0.01137237, "auxiliary_loss_mlp": 0.01025666, "balance_loss_clip": 1.04645061, "balance_loss_mlp": 1.01857948, "epoch": 0.7198941862562376, "flos": 18040587793920.0, "grad_norm": 1.9472946920517729, "language_loss": 0.77162278, "learning_rate": 7.679542141318065e-07, "loss": 0.79325175, "num_input_tokens_seen": 128721290, "step": 5987, "time_per_iteration": 2.5110974311828613 }, { "auxiliary_loss_clip": 0.0112412, "auxiliary_loss_mlp": 0.01026425, "balance_loss_clip": 1.0413729, "balance_loss_mlp": 1.01898086, "epoch": 0.7200144291468767, "flos": 29022393542400.0, "grad_norm": 1.64610599489961, "language_loss": 0.75758183, "learning_rate": 7.673406896252013e-07, "loss": 0.77908719, "num_input_tokens_seen": 128742665, "step": 5988, "time_per_iteration": 3.2978878021240234 }, { "auxiliary_loss_clip": 0.01122255, "auxiliary_loss_mlp": 0.01033808, "balance_loss_clip": 1.04157925, "balance_loss_mlp": 1.02597046, "epoch": 0.7201346720375158, "flos": 25374264624000.0, "grad_norm": 1.544570902879685, "language_loss": 0.78397924, "learning_rate": 7.667273521120347e-07, "loss": 0.80553985, "num_input_tokens_seen": 128762225, "step": 5989, "time_per_iteration": 3.3561501502990723 }, { "auxiliary_loss_clip": 0.01127706, "auxiliary_loss_mlp": 0.01026405, "balance_loss_clip": 1.04457736, "balance_loss_mlp": 1.019467, "epoch": 0.7202549149281549, "flos": 14355614499840.0, "grad_norm": 1.906997415890972, "language_loss": 0.79654211, "learning_rate": 7.661142016853468e-07, "loss": 0.81808323, "num_input_tokens_seen": 128779585, "step": 5990, "time_per_iteration": 2.542182683944702 }, { "auxiliary_loss_clip": 0.01108869, "auxiliary_loss_mlp": 0.01027541, "balance_loss_clip": 1.04210103, "balance_loss_mlp": 1.0211277, "epoch": 0.7203751578187939, "flos": 23001682550400.0, "grad_norm": 1.803878803807844, "language_loss": 0.74676669, "learning_rate": 7.655012384381543e-07, "loss": 0.7681309, "num_input_tokens_seen": 128799070, "step": 5991, "time_per_iteration": 2.600414276123047 }, { "auxiliary_loss_clip": 0.01136658, "auxiliary_loss_mlp": 0.01023366, "balance_loss_clip": 1.04790568, "balance_loss_mlp": 1.01627362, "epoch": 0.7204954007094331, "flos": 23692424065920.0, "grad_norm": 1.654421799121267, "language_loss": 0.81728899, "learning_rate": 7.648884624634415e-07, "loss": 0.83888918, "num_input_tokens_seen": 128817620, "step": 5992, "time_per_iteration": 2.5252840518951416 }, { "auxiliary_loss_clip": 0.01153992, "auxiliary_loss_mlp": 0.01028995, "balance_loss_clip": 1.04908311, "balance_loss_mlp": 1.02142549, "epoch": 0.7206156436000721, "flos": 16253026531200.0, "grad_norm": 2.2749561700646144, "language_loss": 0.88694954, "learning_rate": 7.642758738541683e-07, "loss": 0.90877938, "num_input_tokens_seen": 128834200, "step": 5993, "time_per_iteration": 2.4685490131378174 }, { "auxiliary_loss_clip": 0.01048931, "auxiliary_loss_mlp": 0.01003398, "balance_loss_clip": 1.01074028, "balance_loss_mlp": 1.00237834, "epoch": 0.7207358864907112, "flos": 54377806504320.0, "grad_norm": 0.7589792289607421, "language_loss": 0.60787463, "learning_rate": 7.636634727032621e-07, "loss": 0.62839782, "num_input_tokens_seen": 128891305, "step": 5994, "time_per_iteration": 2.94522762298584 }, { "auxiliary_loss_clip": 0.01122915, "auxiliary_loss_mlp": 0.01028469, "balance_loss_clip": 1.0393281, "balance_loss_mlp": 1.02021432, "epoch": 0.7208561293813504, "flos": 19135540033920.0, "grad_norm": 2.0816263235062755, "language_loss": 0.79029691, "learning_rate": 7.630512591036231e-07, "loss": 0.81181073, "num_input_tokens_seen": 128910615, "step": 5995, "time_per_iteration": 2.5494720935821533 }, { "auxiliary_loss_clip": 0.01153538, "auxiliary_loss_mlp": 0.01025231, "balance_loss_clip": 1.04793477, "balance_loss_mlp": 1.01792967, "epoch": 0.7209763722719894, "flos": 17748526308480.0, "grad_norm": 2.1457538059649743, "language_loss": 0.64419222, "learning_rate": 7.624392331481255e-07, "loss": 0.66597986, "num_input_tokens_seen": 128928270, "step": 5996, "time_per_iteration": 2.4707024097442627 }, { "auxiliary_loss_clip": 0.01046765, "auxiliary_loss_mlp": 0.01004465, "balance_loss_clip": 1.00899899, "balance_loss_mlp": 1.00348747, "epoch": 0.7210966151626285, "flos": 66819488716800.0, "grad_norm": 0.7456020188067483, "language_loss": 0.51846313, "learning_rate": 7.618273949296115e-07, "loss": 0.53897536, "num_input_tokens_seen": 128987780, "step": 5997, "time_per_iteration": 3.03424072265625 }, { "auxiliary_loss_clip": 0.01133685, "auxiliary_loss_mlp": 0.01027222, "balance_loss_clip": 1.04328966, "balance_loss_mlp": 1.01956296, "epoch": 0.7212168580532676, "flos": 21141869080320.0, "grad_norm": 2.024964884046603, "language_loss": 0.68511081, "learning_rate": 7.612157445408987e-07, "loss": 0.70671982, "num_input_tokens_seen": 129005590, "step": 5998, "time_per_iteration": 2.53128981590271 }, { "auxiliary_loss_clip": 0.01142928, "auxiliary_loss_mlp": 0.01029394, "balance_loss_clip": 1.04928899, "balance_loss_mlp": 1.02144885, "epoch": 0.7213371009439067, "flos": 22345738335360.0, "grad_norm": 2.004500326225906, "language_loss": 0.74225789, "learning_rate": 7.606042820747716e-07, "loss": 0.7639811, "num_input_tokens_seen": 129021995, "step": 5999, "time_per_iteration": 2.5275938510894775 }, { "auxiliary_loss_clip": 0.01148283, "auxiliary_loss_mlp": 0.01026605, "balance_loss_clip": 1.05031347, "balance_loss_mlp": 1.01929426, "epoch": 0.7214573438345457, "flos": 18515901490560.0, "grad_norm": 1.7301633239825274, "language_loss": 0.8543936, "learning_rate": 7.599930076239889e-07, "loss": 0.87614244, "num_input_tokens_seen": 129039280, "step": 6000, "time_per_iteration": 2.509868621826172 }, { "auxiliary_loss_clip": 0.01117807, "auxiliary_loss_mlp": 0.0076106, "balance_loss_clip": 1.04697335, "balance_loss_mlp": 1.00023389, "epoch": 0.7215775867251849, "flos": 35736108606720.0, "grad_norm": 1.8180902292505796, "language_loss": 0.70284384, "learning_rate": 7.593819212812818e-07, "loss": 0.72163254, "num_input_tokens_seen": 129060860, "step": 6001, "time_per_iteration": 2.705749750137329 }, { "auxiliary_loss_clip": 0.01151, "auxiliary_loss_mlp": 0.01026555, "balance_loss_clip": 1.04693985, "balance_loss_mlp": 1.01961732, "epoch": 0.721697829615824, "flos": 20372410909440.0, "grad_norm": 1.849893755158785, "language_loss": 0.71806359, "learning_rate": 7.587710231393508e-07, "loss": 0.73983914, "num_input_tokens_seen": 129079215, "step": 6002, "time_per_iteration": 3.2011187076568604 }, { "auxiliary_loss_clip": 0.01072191, "auxiliary_loss_mlp": 0.0102419, "balance_loss_clip": 1.03705049, "balance_loss_mlp": 1.01697826, "epoch": 0.721818072506463, "flos": 20229809915520.0, "grad_norm": 2.1288458115800295, "language_loss": 0.83858395, "learning_rate": 7.581603132908685e-07, "loss": 0.85954785, "num_input_tokens_seen": 129097185, "step": 6003, "time_per_iteration": 2.632154941558838 }, { "auxiliary_loss_clip": 0.0111815, "auxiliary_loss_mlp": 0.01024852, "balance_loss_clip": 1.04318392, "balance_loss_mlp": 1.01737487, "epoch": 0.7219383153971022, "flos": 18186887888640.0, "grad_norm": 1.9200366139786254, "language_loss": 0.78292811, "learning_rate": 7.575497918284795e-07, "loss": 0.80435818, "num_input_tokens_seen": 129114730, "step": 6004, "time_per_iteration": 2.5318708419799805 }, { "auxiliary_loss_clip": 0.01169518, "auxiliary_loss_mlp": 0.01031485, "balance_loss_clip": 1.04848695, "balance_loss_mlp": 1.02420092, "epoch": 0.7220585582877412, "flos": 17342124854400.0, "grad_norm": 2.0176781157168002, "language_loss": 0.74242133, "learning_rate": 7.569394588447984e-07, "loss": 0.76443136, "num_input_tokens_seen": 129131745, "step": 6005, "time_per_iteration": 2.431175947189331 }, { "auxiliary_loss_clip": 0.01142515, "auxiliary_loss_mlp": 0.01028107, "balance_loss_clip": 1.04310751, "balance_loss_mlp": 1.02090406, "epoch": 0.7221788011783803, "flos": 16976338704000.0, "grad_norm": 2.287071371331218, "language_loss": 0.7841208, "learning_rate": 7.563293144324146e-07, "loss": 0.80582702, "num_input_tokens_seen": 129147295, "step": 6006, "time_per_iteration": 2.4634854793548584 }, { "auxiliary_loss_clip": 0.01166682, "auxiliary_loss_mlp": 0.01027733, "balance_loss_clip": 1.05065835, "balance_loss_mlp": 1.02130795, "epoch": 0.7222990440690195, "flos": 26286359702400.0, "grad_norm": 1.7570543867430142, "language_loss": 0.79853022, "learning_rate": 7.557193586838834e-07, "loss": 0.82047439, "num_input_tokens_seen": 129162660, "step": 6007, "time_per_iteration": 2.489490509033203 }, { "auxiliary_loss_clip": 0.01136348, "auxiliary_loss_mlp": 0.01024888, "balance_loss_clip": 1.04478419, "balance_loss_mlp": 1.0177834, "epoch": 0.7224192869596585, "flos": 17601687509760.0, "grad_norm": 2.099593664387454, "language_loss": 0.70667857, "learning_rate": 7.551095916917371e-07, "loss": 0.72829092, "num_input_tokens_seen": 129179990, "step": 6008, "time_per_iteration": 2.4936330318450928 }, { "auxiliary_loss_clip": 0.01132923, "auxiliary_loss_mlp": 0.01029077, "balance_loss_clip": 1.04550099, "balance_loss_mlp": 1.0207206, "epoch": 0.7225395298502976, "flos": 12932331016320.0, "grad_norm": 2.461661262484918, "language_loss": 0.66439593, "learning_rate": 7.545000135484758e-07, "loss": 0.68601596, "num_input_tokens_seen": 129197425, "step": 6009, "time_per_iteration": 2.5281214714050293 }, { "auxiliary_loss_clip": 0.01168894, "auxiliary_loss_mlp": 0.00761656, "balance_loss_clip": 1.05031466, "balance_loss_mlp": 1.00020063, "epoch": 0.7226597727409367, "flos": 29643899592960.0, "grad_norm": 1.9290035644865262, "language_loss": 0.62604046, "learning_rate": 7.538906243465714e-07, "loss": 0.64534593, "num_input_tokens_seen": 129217560, "step": 6010, "time_per_iteration": 2.5306835174560547 }, { "auxiliary_loss_clip": 0.01169003, "auxiliary_loss_mlp": 0.01025207, "balance_loss_clip": 1.0506382, "balance_loss_mlp": 1.01785755, "epoch": 0.7227800156315758, "flos": 13771635183360.0, "grad_norm": 2.034806767747672, "language_loss": 0.78797686, "learning_rate": 7.5328142417847e-07, "loss": 0.809919, "num_input_tokens_seen": 129234325, "step": 6011, "time_per_iteration": 2.4336512088775635 }, { "auxiliary_loss_clip": 0.01149093, "auxiliary_loss_mlp": 0.01024613, "balance_loss_clip": 1.04431927, "balance_loss_mlp": 1.01763034, "epoch": 0.7229002585222148, "flos": 20301882554880.0, "grad_norm": 2.0577349814152677, "language_loss": 0.69263887, "learning_rate": 7.526724131365838e-07, "loss": 0.71437597, "num_input_tokens_seen": 129255280, "step": 6012, "time_per_iteration": 3.3039486408233643 }, { "auxiliary_loss_clip": 0.01139371, "auxiliary_loss_mlp": 0.01027586, "balance_loss_clip": 1.04902732, "balance_loss_mlp": 1.02012396, "epoch": 0.723020501412854, "flos": 16581250033920.0, "grad_norm": 1.6119477145600274, "language_loss": 0.70297396, "learning_rate": 7.520635913133017e-07, "loss": 0.72464359, "num_input_tokens_seen": 129273910, "step": 6013, "time_per_iteration": 2.496561050415039 }, { "auxiliary_loss_clip": 0.0115807, "auxiliary_loss_mlp": 0.01027345, "balance_loss_clip": 1.04805458, "balance_loss_mlp": 1.01944709, "epoch": 0.7231407443034931, "flos": 28548300908160.0, "grad_norm": 1.7615499992057173, "language_loss": 0.82696795, "learning_rate": 7.514549588009798e-07, "loss": 0.84882212, "num_input_tokens_seen": 129294785, "step": 6014, "time_per_iteration": 3.344890594482422 }, { "auxiliary_loss_clip": 0.01142728, "auxiliary_loss_mlp": 0.01025604, "balance_loss_clip": 1.04637384, "balance_loss_mlp": 1.01786089, "epoch": 0.7232609871941321, "flos": 30008536508160.0, "grad_norm": 1.7954453465708753, "language_loss": 0.70570242, "learning_rate": 7.508465156919492e-07, "loss": 0.72738576, "num_input_tokens_seen": 129318295, "step": 6015, "time_per_iteration": 2.6120429039001465 }, { "auxiliary_loss_clip": 0.01139889, "auxiliary_loss_mlp": 0.01029244, "balance_loss_clip": 1.04601681, "balance_loss_mlp": 1.02192426, "epoch": 0.7233812300847713, "flos": 16654005031680.0, "grad_norm": 2.549608226830489, "language_loss": 0.61169297, "learning_rate": 7.502382620785083e-07, "loss": 0.63338429, "num_input_tokens_seen": 129334845, "step": 6016, "time_per_iteration": 3.4682233333587646 }, { "auxiliary_loss_clip": 0.01023723, "auxiliary_loss_mlp": 0.01001365, "balance_loss_clip": 1.01146555, "balance_loss_mlp": 1.00023305, "epoch": 0.7235014729754103, "flos": 67258784050560.0, "grad_norm": 0.8205871340081141, "language_loss": 0.6255067, "learning_rate": 7.496301980529289e-07, "loss": 0.64575756, "num_input_tokens_seen": 129398055, "step": 6017, "time_per_iteration": 3.219339370727539 }, { "auxiliary_loss_clip": 0.01166516, "auxiliary_loss_mlp": 0.01026965, "balance_loss_clip": 1.04797769, "balance_loss_mlp": 1.02004457, "epoch": 0.7236217158660494, "flos": 26943237671040.0, "grad_norm": 2.0803004880534615, "language_loss": 0.74327517, "learning_rate": 7.490223237074547e-07, "loss": 0.76521003, "num_input_tokens_seen": 129417765, "step": 6018, "time_per_iteration": 2.517594337463379 }, { "auxiliary_loss_clip": 0.01122731, "auxiliary_loss_mlp": 0.01029973, "balance_loss_clip": 1.04195142, "balance_loss_mlp": 1.02290392, "epoch": 0.7237419587566886, "flos": 29423372042880.0, "grad_norm": 1.7449495140480127, "language_loss": 0.66046876, "learning_rate": 7.484146391342989e-07, "loss": 0.68199581, "num_input_tokens_seen": 129437560, "step": 6019, "time_per_iteration": 2.610764265060425 }, { "auxiliary_loss_clip": 0.01131366, "auxiliary_loss_mlp": 0.01028335, "balance_loss_clip": 1.04410815, "balance_loss_mlp": 1.02138853, "epoch": 0.7238622016473276, "flos": 17821496787840.0, "grad_norm": 2.1947883400193002, "language_loss": 0.56805044, "learning_rate": 7.478071444256484e-07, "loss": 0.58964741, "num_input_tokens_seen": 129455320, "step": 6020, "time_per_iteration": 2.4936747550964355 }, { "auxiliary_loss_clip": 0.01130958, "auxiliary_loss_mlp": 0.01027316, "balance_loss_clip": 1.04434514, "balance_loss_mlp": 1.01992202, "epoch": 0.7239824445379667, "flos": 25739117020800.0, "grad_norm": 1.8123824945681781, "language_loss": 0.79572105, "learning_rate": 7.471998396736579e-07, "loss": 0.8173039, "num_input_tokens_seen": 129475700, "step": 6021, "time_per_iteration": 2.5847086906433105 }, { "auxiliary_loss_clip": 0.01126142, "auxiliary_loss_mlp": 0.01024487, "balance_loss_clip": 1.04520226, "balance_loss_mlp": 1.01736152, "epoch": 0.7241026874286057, "flos": 23148916398720.0, "grad_norm": 1.6412704474877562, "language_loss": 0.75915045, "learning_rate": 7.465927249704549e-07, "loss": 0.78065675, "num_input_tokens_seen": 129493585, "step": 6022, "time_per_iteration": 2.574049949645996 }, { "auxiliary_loss_clip": 0.01151842, "auxiliary_loss_mlp": 0.0102607, "balance_loss_clip": 1.046875, "balance_loss_mlp": 1.01904225, "epoch": 0.7242229303192449, "flos": 20266905686400.0, "grad_norm": 1.6504415999141175, "language_loss": 0.77165931, "learning_rate": 7.459858004081398e-07, "loss": 0.79343843, "num_input_tokens_seen": 129511555, "step": 6023, "time_per_iteration": 2.4821877479553223 }, { "auxiliary_loss_clip": 0.01020224, "auxiliary_loss_mlp": 0.01000878, "balance_loss_clip": 1.00808764, "balance_loss_mlp": 0.99973965, "epoch": 0.724343173209884, "flos": 62311659684480.0, "grad_norm": 0.6533093154779064, "language_loss": 0.58022368, "learning_rate": 7.453790660787815e-07, "loss": 0.60043466, "num_input_tokens_seen": 129579650, "step": 6024, "time_per_iteration": 3.2357540130615234 }, { "auxiliary_loss_clip": 0.01140657, "auxiliary_loss_mlp": 0.01026526, "balance_loss_clip": 1.04651582, "balance_loss_mlp": 1.01870251, "epoch": 0.724463416100523, "flos": 35006403813120.0, "grad_norm": 1.9671603326116, "language_loss": 0.63376701, "learning_rate": 7.447725220744214e-07, "loss": 0.65543884, "num_input_tokens_seen": 129601895, "step": 6025, "time_per_iteration": 2.630401134490967 }, { "auxiliary_loss_clip": 0.01168221, "auxiliary_loss_mlp": 0.01030065, "balance_loss_clip": 1.04930341, "balance_loss_mlp": 1.02239108, "epoch": 0.7245836589911622, "flos": 21871968923520.0, "grad_norm": 1.9229967721616303, "language_loss": 0.77247822, "learning_rate": 7.441661684870717e-07, "loss": 0.79446107, "num_input_tokens_seen": 129622150, "step": 6026, "time_per_iteration": 2.4945008754730225 }, { "auxiliary_loss_clip": 0.01168937, "auxiliary_loss_mlp": 0.01029937, "balance_loss_clip": 1.05044246, "balance_loss_mlp": 1.02271259, "epoch": 0.7247039018818012, "flos": 23006494972800.0, "grad_norm": 1.6799633139673031, "language_loss": 0.81964236, "learning_rate": 7.435600054087152e-07, "loss": 0.84163111, "num_input_tokens_seen": 129644315, "step": 6027, "time_per_iteration": 2.500795364379883 }, { "auxiliary_loss_clip": 0.01170098, "auxiliary_loss_mlp": 0.01027071, "balance_loss_clip": 1.05189323, "balance_loss_mlp": 1.01960564, "epoch": 0.7248241447724403, "flos": 31722588587520.0, "grad_norm": 2.164655833649386, "language_loss": 0.74499679, "learning_rate": 7.42954032931308e-07, "loss": 0.76696849, "num_input_tokens_seen": 129665355, "step": 6028, "time_per_iteration": 3.333434581756592 }, { "auxiliary_loss_clip": 0.01141854, "auxiliary_loss_mlp": 0.01026436, "balance_loss_clip": 1.04630029, "balance_loss_mlp": 1.01937294, "epoch": 0.7249443876630794, "flos": 34896984007680.0, "grad_norm": 2.976625337344399, "language_loss": 0.74740255, "learning_rate": 7.423482511467733e-07, "loss": 0.76908547, "num_input_tokens_seen": 129686125, "step": 6029, "time_per_iteration": 2.6452536582946777 }, { "auxiliary_loss_clip": 0.01085339, "auxiliary_loss_mlp": 0.01027052, "balance_loss_clip": 1.03978872, "balance_loss_mlp": 1.0198729, "epoch": 0.7250646305537185, "flos": 26359294268160.0, "grad_norm": 2.3323351708798485, "language_loss": 0.64472157, "learning_rate": 7.417426601470099e-07, "loss": 0.66584551, "num_input_tokens_seen": 129706485, "step": 6030, "time_per_iteration": 2.6629509925842285 }, { "auxiliary_loss_clip": 0.01154973, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.04701805, "balance_loss_mlp": 1.02167284, "epoch": 0.7251848734443576, "flos": 30081614728320.0, "grad_norm": 2.208859703485093, "language_loss": 0.78734636, "learning_rate": 7.411372600238841e-07, "loss": 0.80919194, "num_input_tokens_seen": 129727100, "step": 6031, "time_per_iteration": 2.5671136379241943 }, { "auxiliary_loss_clip": 0.01166115, "auxiliary_loss_mlp": 0.01026083, "balance_loss_clip": 1.04766846, "balance_loss_mlp": 1.0189184, "epoch": 0.7253051163349967, "flos": 17785262943360.0, "grad_norm": 2.0314062574551772, "language_loss": 0.7385596, "learning_rate": 7.405320508692346e-07, "loss": 0.76048154, "num_input_tokens_seen": 129745840, "step": 6032, "time_per_iteration": 2.4470086097717285 }, { "auxiliary_loss_clip": 0.01163912, "auxiliary_loss_mlp": 0.01028705, "balance_loss_clip": 1.04892159, "balance_loss_mlp": 1.02201152, "epoch": 0.7254253592256358, "flos": 12641346938880.0, "grad_norm": 1.8565080144584087, "language_loss": 0.7557106, "learning_rate": 7.399270327748727e-07, "loss": 0.77763677, "num_input_tokens_seen": 129763500, "step": 6033, "time_per_iteration": 2.449816942214966 }, { "auxiliary_loss_clip": 0.01123251, "auxiliary_loss_mlp": 0.00760186, "balance_loss_clip": 1.04069805, "balance_loss_mlp": 1.0002085, "epoch": 0.7255456021162748, "flos": 27199208966400.0, "grad_norm": 1.7901941083950736, "language_loss": 0.74011099, "learning_rate": 7.39322205832577e-07, "loss": 0.75894535, "num_input_tokens_seen": 129784390, "step": 6034, "time_per_iteration": 2.610888957977295 }, { "auxiliary_loss_clip": 0.01134157, "auxiliary_loss_mlp": 0.01024969, "balance_loss_clip": 1.04488277, "balance_loss_mlp": 1.01835012, "epoch": 0.725665845006914, "flos": 21288205088640.0, "grad_norm": 1.775278754911872, "language_loss": 0.80799842, "learning_rate": 7.387175701341009e-07, "loss": 0.82958966, "num_input_tokens_seen": 129803060, "step": 6035, "time_per_iteration": 2.517200231552124 }, { "auxiliary_loss_clip": 0.01151753, "auxiliary_loss_mlp": 0.01027477, "balance_loss_clip": 1.04591179, "balance_loss_mlp": 1.01983595, "epoch": 0.7257860878975531, "flos": 16033684129920.0, "grad_norm": 2.5984919137778824, "language_loss": 0.72092247, "learning_rate": 7.381131257711659e-07, "loss": 0.74271476, "num_input_tokens_seen": 129820165, "step": 6036, "time_per_iteration": 2.4815425872802734 }, { "auxiliary_loss_clip": 0.01134178, "auxiliary_loss_mlp": 0.01030831, "balance_loss_clip": 1.04675698, "balance_loss_mlp": 1.02391076, "epoch": 0.7259063307881921, "flos": 12129943052160.0, "grad_norm": 1.8171268548371624, "language_loss": 0.83217037, "learning_rate": 7.375088728354677e-07, "loss": 0.85382044, "num_input_tokens_seen": 129835195, "step": 6037, "time_per_iteration": 2.486875057220459 }, { "auxiliary_loss_clip": 0.01125631, "auxiliary_loss_mlp": 0.01023891, "balance_loss_clip": 1.04246855, "balance_loss_mlp": 1.01644921, "epoch": 0.7260265736788313, "flos": 30443845432320.0, "grad_norm": 2.283523733435723, "language_loss": 0.67796856, "learning_rate": 7.369048114186691e-07, "loss": 0.69946373, "num_input_tokens_seen": 129856240, "step": 6038, "time_per_iteration": 2.642566442489624 }, { "auxiliary_loss_clip": 0.01129788, "auxiliary_loss_mlp": 0.00760327, "balance_loss_clip": 1.04547513, "balance_loss_mlp": 1.00021636, "epoch": 0.7261468165694703, "flos": 21142264129920.0, "grad_norm": 1.8309073878649382, "language_loss": 0.83058393, "learning_rate": 7.363009416124055e-07, "loss": 0.84948504, "num_input_tokens_seen": 129875565, "step": 6039, "time_per_iteration": 3.3588573932647705 }, { "auxiliary_loss_clip": 0.01129129, "auxiliary_loss_mlp": 0.01030194, "balance_loss_clip": 1.04660368, "balance_loss_mlp": 1.02254975, "epoch": 0.7262670594601094, "flos": 22306308180480.0, "grad_norm": 2.2862288491978533, "language_loss": 0.63154906, "learning_rate": 7.356972635082852e-07, "loss": 0.65314233, "num_input_tokens_seen": 129894420, "step": 6040, "time_per_iteration": 3.2535743713378906 }, { "auxiliary_loss_clip": 0.01108854, "auxiliary_loss_mlp": 0.01028775, "balance_loss_clip": 1.04587424, "balance_loss_mlp": 1.0211519, "epoch": 0.7263873023507486, "flos": 25335049950720.0, "grad_norm": 1.7271910863739415, "language_loss": 0.75217485, "learning_rate": 7.35093777197884e-07, "loss": 0.77355123, "num_input_tokens_seen": 129914490, "step": 6041, "time_per_iteration": 2.630213737487793 }, { "auxiliary_loss_clip": 0.01139674, "auxiliary_loss_mlp": 0.01025202, "balance_loss_clip": 1.04908001, "balance_loss_mlp": 1.01832414, "epoch": 0.7265075452413876, "flos": 23878621192320.0, "grad_norm": 1.9623093678696535, "language_loss": 0.85724187, "learning_rate": 7.344904827727525e-07, "loss": 0.87889069, "num_input_tokens_seen": 129931670, "step": 6042, "time_per_iteration": 3.2667293548583984 }, { "auxiliary_loss_clip": 0.01125952, "auxiliary_loss_mlp": 0.01027588, "balance_loss_clip": 1.04347134, "balance_loss_mlp": 1.02037537, "epoch": 0.7266277881320267, "flos": 28724549967360.0, "grad_norm": 2.7224732507511247, "language_loss": 0.73279637, "learning_rate": 7.338873803244076e-07, "loss": 0.75433177, "num_input_tokens_seen": 129946905, "step": 6043, "time_per_iteration": 2.600013017654419 }, { "auxiliary_loss_clip": 0.01134971, "auxiliary_loss_mlp": 0.01022707, "balance_loss_clip": 1.04539371, "balance_loss_mlp": 1.01490486, "epoch": 0.7267480310226658, "flos": 24863507182080.0, "grad_norm": 1.9306921239607409, "language_loss": 0.80971575, "learning_rate": 7.332844699443401e-07, "loss": 0.83129251, "num_input_tokens_seen": 129965505, "step": 6044, "time_per_iteration": 2.557462453842163 }, { "auxiliary_loss_clip": 0.01103195, "auxiliary_loss_mlp": 0.0102899, "balance_loss_clip": 1.04051208, "balance_loss_mlp": 1.02252841, "epoch": 0.7268682739133049, "flos": 27198490694400.0, "grad_norm": 1.8687207169122715, "language_loss": 0.75291967, "learning_rate": 7.326817517240121e-07, "loss": 0.77424157, "num_input_tokens_seen": 129987210, "step": 6045, "time_per_iteration": 2.6452417373657227 }, { "auxiliary_loss_clip": 0.01152167, "auxiliary_loss_mlp": 0.00760747, "balance_loss_clip": 1.04616225, "balance_loss_mlp": 1.00023389, "epoch": 0.7269885168039439, "flos": 33508138688640.0, "grad_norm": 1.6679077356258816, "language_loss": 0.83194202, "learning_rate": 7.320792257548545e-07, "loss": 0.85107112, "num_input_tokens_seen": 130008385, "step": 6046, "time_per_iteration": 2.6381494998931885 }, { "auxiliary_loss_clip": 0.01145042, "auxiliary_loss_mlp": 0.01029803, "balance_loss_clip": 1.04747057, "balance_loss_mlp": 1.02217913, "epoch": 0.7271087596945831, "flos": 24313750548480.0, "grad_norm": 3.261507694159518, "language_loss": 0.76324701, "learning_rate": 7.314768921282704e-07, "loss": 0.7849955, "num_input_tokens_seen": 130029040, "step": 6047, "time_per_iteration": 2.551164150238037 }, { "auxiliary_loss_clip": 0.01154774, "auxiliary_loss_mlp": 0.01032669, "balance_loss_clip": 1.04811943, "balance_loss_mlp": 1.0253917, "epoch": 0.7272290025852222, "flos": 23805147922560.0, "grad_norm": 2.738044261923142, "language_loss": 0.72350335, "learning_rate": 7.30874750935633e-07, "loss": 0.74537778, "num_input_tokens_seen": 130048725, "step": 6048, "time_per_iteration": 2.520831346511841 }, { "auxiliary_loss_clip": 0.01127022, "auxiliary_loss_mlp": 0.01028004, "balance_loss_clip": 1.04828668, "balance_loss_mlp": 1.0211432, "epoch": 0.7273492454758612, "flos": 16720367408640.0, "grad_norm": 1.8919683898677593, "language_loss": 0.79001284, "learning_rate": 7.30272802268286e-07, "loss": 0.81156301, "num_input_tokens_seen": 130065720, "step": 6049, "time_per_iteration": 2.5397629737854004 }, { "auxiliary_loss_clip": 0.01072968, "auxiliary_loss_mlp": 0.01025283, "balance_loss_clip": 1.03816688, "balance_loss_mlp": 1.01865172, "epoch": 0.7274694883665004, "flos": 28031330413440.0, "grad_norm": 1.6692677653600763, "language_loss": 0.76507086, "learning_rate": 7.29671046217547e-07, "loss": 0.78605336, "num_input_tokens_seen": 130084830, "step": 6050, "time_per_iteration": 2.666482448577881 }, { "auxiliary_loss_clip": 0.01128033, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.04581451, "balance_loss_mlp": 1.02193475, "epoch": 0.7275897312571394, "flos": 30372706546560.0, "grad_norm": 1.6733152034477852, "language_loss": 0.81395847, "learning_rate": 7.290694828746988e-07, "loss": 0.83552325, "num_input_tokens_seen": 130104495, "step": 6051, "time_per_iteration": 2.6247212886810303 }, { "auxiliary_loss_clip": 0.01129499, "auxiliary_loss_mlp": 0.01028058, "balance_loss_clip": 1.04485178, "balance_loss_mlp": 1.02156413, "epoch": 0.7277099741477785, "flos": 19204775498880.0, "grad_norm": 2.1554092415129147, "language_loss": 0.85999918, "learning_rate": 7.284681123310004e-07, "loss": 0.88157475, "num_input_tokens_seen": 130123210, "step": 6052, "time_per_iteration": 2.5638842582702637 }, { "auxiliary_loss_clip": 0.01151485, "auxiliary_loss_mlp": 0.01029071, "balance_loss_clip": 1.04738545, "balance_loss_mlp": 1.02142107, "epoch": 0.7278302170384175, "flos": 20667884186880.0, "grad_norm": 1.551524885540593, "language_loss": 0.79448247, "learning_rate": 7.27866934677678e-07, "loss": 0.81628799, "num_input_tokens_seen": 130142880, "step": 6053, "time_per_iteration": 2.4973134994506836 }, { "auxiliary_loss_clip": 0.01109092, "auxiliary_loss_mlp": 0.01027679, "balance_loss_clip": 1.04529428, "balance_loss_mlp": 1.02077341, "epoch": 0.7279504599290567, "flos": 19093200877440.0, "grad_norm": 1.8067615048472734, "language_loss": 0.78274691, "learning_rate": 7.272659500059297e-07, "loss": 0.80411458, "num_input_tokens_seen": 130160220, "step": 6054, "time_per_iteration": 3.289480686187744 }, { "auxiliary_loss_clip": 0.01148155, "auxiliary_loss_mlp": 0.01030759, "balance_loss_clip": 1.04780746, "balance_loss_mlp": 1.02301002, "epoch": 0.7280707028196958, "flos": 19062174504960.0, "grad_norm": 1.9340363824057272, "language_loss": 0.79844505, "learning_rate": 7.266651584069264e-07, "loss": 0.82023418, "num_input_tokens_seen": 130177885, "step": 6055, "time_per_iteration": 2.4690845012664795 }, { "auxiliary_loss_clip": 0.01157032, "auxiliary_loss_mlp": 0.01030563, "balance_loss_clip": 1.05006242, "balance_loss_mlp": 1.02351141, "epoch": 0.7281909457103348, "flos": 37196308293120.0, "grad_norm": 1.602151876923802, "language_loss": 0.57119906, "learning_rate": 7.260645599718045e-07, "loss": 0.59307504, "num_input_tokens_seen": 130204240, "step": 6056, "time_per_iteration": 2.6359200477600098 }, { "auxiliary_loss_clip": 0.0114002, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.04550982, "balance_loss_mlp": 1.02484095, "epoch": 0.728311188600974, "flos": 20667094087680.0, "grad_norm": 1.9072519677912556, "language_loss": 0.67138922, "learning_rate": 7.254641547916767e-07, "loss": 0.69311917, "num_input_tokens_seen": 130221735, "step": 6057, "time_per_iteration": 2.5202393531799316 }, { "auxiliary_loss_clip": 0.01169513, "auxiliary_loss_mlp": 0.01022981, "balance_loss_clip": 1.05249345, "balance_loss_mlp": 1.01568866, "epoch": 0.728431431491613, "flos": 28840685616000.0, "grad_norm": 1.7216179796725373, "language_loss": 0.69285572, "learning_rate": 7.248639429576226e-07, "loss": 0.71478069, "num_input_tokens_seen": 130241190, "step": 6058, "time_per_iteration": 2.5236005783081055 }, { "auxiliary_loss_clip": 0.0115501, "auxiliary_loss_mlp": 0.01027595, "balance_loss_clip": 1.04877877, "balance_loss_mlp": 1.0206449, "epoch": 0.7285516743822521, "flos": 25991856092160.0, "grad_norm": 1.581711271931304, "language_loss": 0.71976328, "learning_rate": 7.242639245606959e-07, "loss": 0.74158931, "num_input_tokens_seen": 130260980, "step": 6059, "time_per_iteration": 2.5250959396362305 }, { "auxiliary_loss_clip": 0.01141606, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.043872, "balance_loss_mlp": 1.02400589, "epoch": 0.7286719172728913, "flos": 16399721675520.0, "grad_norm": 1.6761927280456423, "language_loss": 0.82019222, "learning_rate": 7.236640996919168e-07, "loss": 0.84192979, "num_input_tokens_seen": 130280025, "step": 6060, "time_per_iteration": 2.517779588699341 }, { "auxiliary_loss_clip": 0.0115598, "auxiliary_loss_mlp": 0.01026752, "balance_loss_clip": 1.04875445, "balance_loss_mlp": 1.01970971, "epoch": 0.7287921601635303, "flos": 22018161277440.0, "grad_norm": 1.569949506168332, "language_loss": 0.70726645, "learning_rate": 7.230644684422782e-07, "loss": 0.72909379, "num_input_tokens_seen": 130300255, "step": 6061, "time_per_iteration": 2.4982268810272217 }, { "auxiliary_loss_clip": 0.01124308, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.04603958, "balance_loss_mlp": 1.02074611, "epoch": 0.7289124030541694, "flos": 24600927784320.0, "grad_norm": 1.7420495394179374, "language_loss": 0.81803882, "learning_rate": 7.224650309027451e-07, "loss": 0.83956218, "num_input_tokens_seen": 130320005, "step": 6062, "time_per_iteration": 2.5880842208862305 }, { "auxiliary_loss_clip": 0.01156234, "auxiliary_loss_mlp": 0.01031002, "balance_loss_clip": 1.04856205, "balance_loss_mlp": 1.02418923, "epoch": 0.7290326459448085, "flos": 21393638484480.0, "grad_norm": 1.74189197124845, "language_loss": 0.68199974, "learning_rate": 7.218657871642506e-07, "loss": 0.70387214, "num_input_tokens_seen": 130338810, "step": 6063, "time_per_iteration": 2.4788947105407715 }, { "auxiliary_loss_clip": 0.01170357, "auxiliary_loss_mlp": 0.01029442, "balance_loss_clip": 1.05062675, "balance_loss_mlp": 1.02191937, "epoch": 0.7291528888354476, "flos": 18587686821120.0, "grad_norm": 2.1037413182366196, "language_loss": 0.62487286, "learning_rate": 7.212667373177012e-07, "loss": 0.64687085, "num_input_tokens_seen": 130353805, "step": 6064, "time_per_iteration": 2.438267946243286 }, { "auxiliary_loss_clip": 0.01126276, "auxiliary_loss_mlp": 0.01024794, "balance_loss_clip": 1.04621923, "balance_loss_mlp": 1.01846755, "epoch": 0.7292731317260867, "flos": 18951066760320.0, "grad_norm": 1.7392754319997927, "language_loss": 0.75077534, "learning_rate": 7.206678814539704e-07, "loss": 0.77228606, "num_input_tokens_seen": 130372105, "step": 6065, "time_per_iteration": 3.352139949798584 }, { "auxiliary_loss_clip": 0.01120325, "auxiliary_loss_mlp": 0.01030134, "balance_loss_clip": 1.04414856, "balance_loss_mlp": 1.0233953, "epoch": 0.7293933746167258, "flos": 21067569797760.0, "grad_norm": 1.485201661390943, "language_loss": 0.72779918, "learning_rate": 7.20069219663904e-07, "loss": 0.74930376, "num_input_tokens_seen": 130391990, "step": 6066, "time_per_iteration": 3.4020602703094482 }, { "auxiliary_loss_clip": 0.01153146, "auxiliary_loss_mlp": 0.01025279, "balance_loss_clip": 1.04468048, "balance_loss_mlp": 1.01769161, "epoch": 0.7295136175073649, "flos": 22453326547200.0, "grad_norm": 2.2006849493544616, "language_loss": 0.79642832, "learning_rate": 7.1947075203832e-07, "loss": 0.81821251, "num_input_tokens_seen": 130411970, "step": 6067, "time_per_iteration": 2.5171258449554443 }, { "auxiliary_loss_clip": 0.01059016, "auxiliary_loss_mlp": 0.01002697, "balance_loss_clip": 1.01084137, "balance_loss_mlp": 1.00165367, "epoch": 0.7296338603980039, "flos": 56125506648960.0, "grad_norm": 0.8656823009127745, "language_loss": 0.60120118, "learning_rate": 7.188724786680049e-07, "loss": 0.6218183, "num_input_tokens_seen": 130472440, "step": 6068, "time_per_iteration": 3.7370970249176025 }, { "auxiliary_loss_clip": 0.01135465, "auxiliary_loss_mlp": 0.01025322, "balance_loss_clip": 1.04350579, "balance_loss_mlp": 1.01813388, "epoch": 0.7297541032886431, "flos": 25228287751680.0, "grad_norm": 1.6512731070546949, "language_loss": 0.75458741, "learning_rate": 7.182743996437162e-07, "loss": 0.77619529, "num_input_tokens_seen": 130491975, "step": 6069, "time_per_iteration": 2.5837912559509277 }, { "auxiliary_loss_clip": 0.01134217, "auxiliary_loss_mlp": 0.01033008, "balance_loss_clip": 1.04493988, "balance_loss_mlp": 1.02552509, "epoch": 0.7298743461792822, "flos": 26467600752000.0, "grad_norm": 1.876076242062857, "language_loss": 0.68731463, "learning_rate": 7.176765150561819e-07, "loss": 0.70898688, "num_input_tokens_seen": 130510580, "step": 6070, "time_per_iteration": 2.613851308822632 }, { "auxiliary_loss_clip": 0.01167972, "auxiliary_loss_mlp": 0.01025095, "balance_loss_clip": 1.04835176, "balance_loss_mlp": 1.01792789, "epoch": 0.7299945890699212, "flos": 19569053278080.0, "grad_norm": 1.9003474517007866, "language_loss": 0.80088556, "learning_rate": 7.170788249961002e-07, "loss": 0.82281625, "num_input_tokens_seen": 130529090, "step": 6071, "time_per_iteration": 2.4640073776245117 }, { "auxiliary_loss_clip": 0.0116304, "auxiliary_loss_mlp": 0.01024543, "balance_loss_clip": 1.04634547, "balance_loss_mlp": 1.01751804, "epoch": 0.7301148319605604, "flos": 22928963466240.0, "grad_norm": 2.093803216761503, "language_loss": 0.88236481, "learning_rate": 7.164813295541418e-07, "loss": 0.90424061, "num_input_tokens_seen": 130548655, "step": 6072, "time_per_iteration": 2.501742124557495 }, { "auxiliary_loss_clip": 0.01139901, "auxiliary_loss_mlp": 0.01027813, "balance_loss_clip": 1.04554272, "balance_loss_mlp": 1.02060628, "epoch": 0.7302350748511994, "flos": 25369703596800.0, "grad_norm": 1.578016737955804, "language_loss": 0.70346665, "learning_rate": 7.15884028820944e-07, "loss": 0.72514385, "num_input_tokens_seen": 130567710, "step": 6073, "time_per_iteration": 2.5824508666992188 }, { "auxiliary_loss_clip": 0.01122664, "auxiliary_loss_mlp": 0.01027544, "balance_loss_clip": 1.04293001, "balance_loss_mlp": 1.02036715, "epoch": 0.7303553177418385, "flos": 27819170732160.0, "grad_norm": 2.2224399866999125, "language_loss": 0.60103613, "learning_rate": 7.152869228871185e-07, "loss": 0.62253821, "num_input_tokens_seen": 130590195, "step": 6074, "time_per_iteration": 2.6101465225219727 }, { "auxiliary_loss_clip": 0.01133903, "auxiliary_loss_mlp": 0.01026408, "balance_loss_clip": 1.04468322, "balance_loss_mlp": 1.01888573, "epoch": 0.7304755606324776, "flos": 24426510318720.0, "grad_norm": 2.3794897439644065, "language_loss": 0.72398841, "learning_rate": 7.146900118432457e-07, "loss": 0.74559152, "num_input_tokens_seen": 130609940, "step": 6075, "time_per_iteration": 2.5549445152282715 }, { "auxiliary_loss_clip": 0.01078753, "auxiliary_loss_mlp": 0.01026661, "balance_loss_clip": 1.035043, "balance_loss_mlp": 1.02003002, "epoch": 0.7305958035231167, "flos": 23840483927040.0, "grad_norm": 1.8074785753540203, "language_loss": 0.86017978, "learning_rate": 7.140932957798753e-07, "loss": 0.88123393, "num_input_tokens_seen": 130628380, "step": 6076, "time_per_iteration": 2.6902780532836914 }, { "auxiliary_loss_clip": 0.01140369, "auxiliary_loss_mlp": 0.01029781, "balance_loss_clip": 1.04320097, "balance_loss_mlp": 1.02279258, "epoch": 0.7307160464137558, "flos": 16726939597440.0, "grad_norm": 1.77651439416414, "language_loss": 0.71268487, "learning_rate": 7.134967747875309e-07, "loss": 0.73438644, "num_input_tokens_seen": 130646590, "step": 6077, "time_per_iteration": 2.523655414581299 }, { "auxiliary_loss_clip": 0.01149107, "auxiliary_loss_mlp": 0.0102838, "balance_loss_clip": 1.04644644, "balance_loss_mlp": 1.02120996, "epoch": 0.7308362893043949, "flos": 21798280172160.0, "grad_norm": 1.8614000522154395, "language_loss": 0.8179034, "learning_rate": 7.129004489567014e-07, "loss": 0.83967823, "num_input_tokens_seen": 130664070, "step": 6078, "time_per_iteration": 2.51029896736145 }, { "auxiliary_loss_clip": 0.01128551, "auxiliary_loss_mlp": 0.01025632, "balance_loss_clip": 1.04581022, "balance_loss_mlp": 1.01853871, "epoch": 0.730956532195034, "flos": 10707377840640.0, "grad_norm": 4.181653702548048, "language_loss": 0.77513433, "learning_rate": 7.123043183778512e-07, "loss": 0.79667616, "num_input_tokens_seen": 130681400, "step": 6079, "time_per_iteration": 2.5309746265411377 }, { "auxiliary_loss_clip": 0.01132328, "auxiliary_loss_mlp": 0.0102367, "balance_loss_clip": 1.04799151, "balance_loss_mlp": 1.01638007, "epoch": 0.731076775085673, "flos": 19791987039360.0, "grad_norm": 1.5873961004848585, "language_loss": 0.65257543, "learning_rate": 7.117083831414114e-07, "loss": 0.67413539, "num_input_tokens_seen": 130700675, "step": 6080, "time_per_iteration": 3.2726364135742188 }, { "auxiliary_loss_clip": 0.01162459, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.04686964, "balance_loss_mlp": 1.01833916, "epoch": 0.7311970179763122, "flos": 20447033414400.0, "grad_norm": 1.7651006069856285, "language_loss": 0.6995213, "learning_rate": 7.11112643337787e-07, "loss": 0.72140169, "num_input_tokens_seen": 130719720, "step": 6081, "time_per_iteration": 2.4463605880737305 }, { "auxiliary_loss_clip": 0.0113791, "auxiliary_loss_mlp": 0.01029256, "balance_loss_clip": 1.04699802, "balance_loss_mlp": 1.0214895, "epoch": 0.7313172608669513, "flos": 18513818501760.0, "grad_norm": 2.267652427622491, "language_loss": 0.76538515, "learning_rate": 7.10517099057349e-07, "loss": 0.7870568, "num_input_tokens_seen": 130736670, "step": 6082, "time_per_iteration": 2.505002975463867 }, { "auxiliary_loss_clip": 0.0113964, "auxiliary_loss_mlp": 0.01028629, "balance_loss_clip": 1.04692566, "balance_loss_mlp": 1.02145314, "epoch": 0.7314375037575903, "flos": 16180738410240.0, "grad_norm": 2.5891969681082214, "language_loss": 0.61765552, "learning_rate": 7.099217503904411e-07, "loss": 0.6393382, "num_input_tokens_seen": 130754525, "step": 6083, "time_per_iteration": 2.484445810317993 }, { "auxiliary_loss_clip": 0.01143485, "auxiliary_loss_mlp": 0.01023909, "balance_loss_clip": 1.04685545, "balance_loss_mlp": 1.01732898, "epoch": 0.7315577466482295, "flos": 17967940536960.0, "grad_norm": 1.9508200031578633, "language_loss": 0.89885527, "learning_rate": 7.093265974273788e-07, "loss": 0.92052925, "num_input_tokens_seen": 130772420, "step": 6084, "time_per_iteration": 2.5266106128692627 }, { "auxiliary_loss_clip": 0.01154151, "auxiliary_loss_mlp": 0.01022451, "balance_loss_clip": 1.04659629, "balance_loss_mlp": 1.01555145, "epoch": 0.7316779895388685, "flos": 18405440190720.0, "grad_norm": 1.7209563171198137, "language_loss": 0.71540642, "learning_rate": 7.087316402584447e-07, "loss": 0.73717242, "num_input_tokens_seen": 130791245, "step": 6085, "time_per_iteration": 2.458700656890869 }, { "auxiliary_loss_clip": 0.01167599, "auxiliary_loss_mlp": 0.01023496, "balance_loss_clip": 1.04874802, "balance_loss_mlp": 1.01665902, "epoch": 0.7317982324295076, "flos": 17928294900480.0, "grad_norm": 1.7456933791010956, "language_loss": 0.86258221, "learning_rate": 7.081368789738953e-07, "loss": 0.88449311, "num_input_tokens_seen": 130808445, "step": 6086, "time_per_iteration": 2.44570255279541 }, { "auxiliary_loss_clip": 0.01131242, "auxiliary_loss_mlp": 0.01026153, "balance_loss_clip": 1.04089141, "balance_loss_mlp": 1.01900065, "epoch": 0.7319184753201466, "flos": 27229840289280.0, "grad_norm": 1.840393467214699, "language_loss": 0.77695543, "learning_rate": 7.075423136639537e-07, "loss": 0.79852939, "num_input_tokens_seen": 130827700, "step": 6087, "time_per_iteration": 2.5537731647491455 }, { "auxiliary_loss_clip": 0.01118515, "auxiliary_loss_mlp": 0.01029132, "balance_loss_clip": 1.04332769, "balance_loss_mlp": 1.02183032, "epoch": 0.7320387182107858, "flos": 37448544574080.0, "grad_norm": 1.7627458311487707, "language_loss": 0.74613822, "learning_rate": 7.069479444188149e-07, "loss": 0.76761472, "num_input_tokens_seen": 130848290, "step": 6088, "time_per_iteration": 2.7087104320526123 }, { "auxiliary_loss_clip": 0.01131889, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 1.04540253, "balance_loss_mlp": 1.02313399, "epoch": 0.7321589611014249, "flos": 17859023521920.0, "grad_norm": 1.736095684632061, "language_loss": 0.82084078, "learning_rate": 7.063537713286453e-07, "loss": 0.84246385, "num_input_tokens_seen": 130865970, "step": 6089, "time_per_iteration": 2.5029029846191406 }, { "auxiliary_loss_clip": 0.0114518, "auxiliary_loss_mlp": 0.01024006, "balance_loss_clip": 1.04617226, "balance_loss_mlp": 1.01735973, "epoch": 0.7322792039920639, "flos": 26100593539200.0, "grad_norm": 2.254304303864522, "language_loss": 0.80627823, "learning_rate": 7.057597944835803e-07, "loss": 0.82797003, "num_input_tokens_seen": 130885245, "step": 6090, "time_per_iteration": 3.333479166030884 }, { "auxiliary_loss_clip": 0.01130771, "auxiliary_loss_mlp": 0.01027771, "balance_loss_clip": 1.04361403, "balance_loss_mlp": 1.02031755, "epoch": 0.7323994468827031, "flos": 25369093065600.0, "grad_norm": 1.6122531804171425, "language_loss": 0.7463448, "learning_rate": 7.051660139737253e-07, "loss": 0.76793021, "num_input_tokens_seen": 130903465, "step": 6091, "time_per_iteration": 2.620601177215576 }, { "auxiliary_loss_clip": 0.01152967, "auxiliary_loss_mlp": 0.00760879, "balance_loss_clip": 1.05138087, "balance_loss_mlp": 1.00018883, "epoch": 0.7325196897733421, "flos": 26907075653760.0, "grad_norm": 1.838032849785397, "language_loss": 0.76427937, "learning_rate": 7.045724298891565e-07, "loss": 0.78341782, "num_input_tokens_seen": 130922935, "step": 6092, "time_per_iteration": 2.5493738651275635 }, { "auxiliary_loss_clip": 0.01153626, "auxiliary_loss_mlp": 0.01025371, "balance_loss_clip": 1.04853892, "balance_loss_mlp": 1.01868916, "epoch": 0.7326399326639812, "flos": 25775781828480.0, "grad_norm": 1.8421568869812313, "language_loss": 0.69293666, "learning_rate": 7.039790423199192e-07, "loss": 0.71472657, "num_input_tokens_seen": 130942575, "step": 6093, "time_per_iteration": 3.296217679977417 }, { "auxiliary_loss_clip": 0.01142883, "auxiliary_loss_mlp": 0.01025588, "balance_loss_clip": 1.04747057, "balance_loss_mlp": 1.01784253, "epoch": 0.7327601755546204, "flos": 21032269706880.0, "grad_norm": 2.163429367163025, "language_loss": 0.77776885, "learning_rate": 7.033858513560322e-07, "loss": 0.79945362, "num_input_tokens_seen": 130958870, "step": 6094, "time_per_iteration": 3.4726510047912598 }, { "auxiliary_loss_clip": 0.0115489, "auxiliary_loss_mlp": 0.01026436, "balance_loss_clip": 1.04924572, "balance_loss_mlp": 1.01977491, "epoch": 0.7328804184452594, "flos": 16289224462080.0, "grad_norm": 2.102136994530873, "language_loss": 0.76402855, "learning_rate": 7.027928570874794e-07, "loss": 0.78584182, "num_input_tokens_seen": 130977060, "step": 6095, "time_per_iteration": 2.4772212505340576 }, { "auxiliary_loss_clip": 0.01164155, "auxiliary_loss_mlp": 0.01030154, "balance_loss_clip": 1.04807448, "balance_loss_mlp": 1.02312326, "epoch": 0.7330006613358985, "flos": 17858233422720.0, "grad_norm": 1.740348312831524, "language_loss": 0.85184395, "learning_rate": 7.022000596042194e-07, "loss": 0.87378705, "num_input_tokens_seen": 130994160, "step": 6096, "time_per_iteration": 2.4233646392822266 }, { "auxiliary_loss_clip": 0.01125351, "auxiliary_loss_mlp": 0.01023097, "balance_loss_clip": 1.04130638, "balance_loss_mlp": 1.01605725, "epoch": 0.7331209042265376, "flos": 22492074343680.0, "grad_norm": 1.9381121811645123, "language_loss": 0.82091022, "learning_rate": 7.016074589961784e-07, "loss": 0.84239471, "num_input_tokens_seen": 131012725, "step": 6097, "time_per_iteration": 2.577152729034424 }, { "auxiliary_loss_clip": 0.01134063, "auxiliary_loss_mlp": 0.0102794, "balance_loss_clip": 1.04571903, "balance_loss_mlp": 1.02103508, "epoch": 0.7332411471171767, "flos": 33072757937280.0, "grad_norm": 1.6834614939276833, "language_loss": 0.66902936, "learning_rate": 7.01015055353253e-07, "loss": 0.69064939, "num_input_tokens_seen": 131035150, "step": 6098, "time_per_iteration": 2.6287682056427 }, { "auxiliary_loss_clip": 0.01099402, "auxiliary_loss_mlp": 0.01029054, "balance_loss_clip": 1.0417732, "balance_loss_mlp": 1.02168417, "epoch": 0.7333613900078157, "flos": 22743017735040.0, "grad_norm": 1.6997175676599443, "language_loss": 0.77777505, "learning_rate": 7.004228487653123e-07, "loss": 0.79905963, "num_input_tokens_seen": 131055955, "step": 6099, "time_per_iteration": 2.6396737098693848 }, { "auxiliary_loss_clip": 0.01118303, "auxiliary_loss_mlp": 0.01028734, "balance_loss_clip": 1.0381602, "balance_loss_mlp": 1.02138114, "epoch": 0.7334816328984549, "flos": 22346133384960.0, "grad_norm": 1.9438831773745733, "language_loss": 0.78194797, "learning_rate": 6.998308393221906e-07, "loss": 0.8034184, "num_input_tokens_seen": 131074360, "step": 6100, "time_per_iteration": 2.553419828414917 }, { "auxiliary_loss_clip": 0.01128322, "auxiliary_loss_mlp": 0.01025269, "balance_loss_clip": 1.04496217, "balance_loss_mlp": 1.01821148, "epoch": 0.733601875789094, "flos": 20736149984640.0, "grad_norm": 2.64116979001144, "language_loss": 0.71212769, "learning_rate": 6.992390271136977e-07, "loss": 0.73366356, "num_input_tokens_seen": 131090070, "step": 6101, "time_per_iteration": 2.5550544261932373 }, { "auxiliary_loss_clip": 0.01145346, "auxiliary_loss_mlp": 0.01025325, "balance_loss_clip": 1.04429901, "balance_loss_mlp": 1.01840472, "epoch": 0.733722118679733, "flos": 22564362464640.0, "grad_norm": 1.860957948995464, "language_loss": 0.85585046, "learning_rate": 6.986474122296094e-07, "loss": 0.87755716, "num_input_tokens_seen": 131109185, "step": 6102, "time_per_iteration": 2.5047428607940674 }, { "auxiliary_loss_clip": 0.01168182, "auxiliary_loss_mlp": 0.01029299, "balance_loss_clip": 1.04960012, "balance_loss_mlp": 1.02163076, "epoch": 0.7338423615703722, "flos": 20084192179200.0, "grad_norm": 1.9199562712764588, "language_loss": 0.72106981, "learning_rate": 6.980559947596751e-07, "loss": 0.74304467, "num_input_tokens_seen": 131127725, "step": 6103, "time_per_iteration": 2.457738161087036 }, { "auxiliary_loss_clip": 0.01113439, "auxiliary_loss_mlp": 0.01024777, "balance_loss_clip": 1.04402709, "balance_loss_mlp": 1.01740074, "epoch": 0.7339626044610112, "flos": 21687675217920.0, "grad_norm": 1.9895494919822285, "language_loss": 0.75913978, "learning_rate": 6.974647747936109e-07, "loss": 0.78052193, "num_input_tokens_seen": 131146110, "step": 6104, "time_per_iteration": 2.5988969802856445 }, { "auxiliary_loss_clip": 0.01164911, "auxiliary_loss_mlp": 0.00760503, "balance_loss_clip": 1.04791963, "balance_loss_mlp": 1.00024211, "epoch": 0.7340828473516503, "flos": 15268248282240.0, "grad_norm": 2.74359589120304, "language_loss": 0.82417929, "learning_rate": 6.968737524211039e-07, "loss": 0.8434335, "num_input_tokens_seen": 131162920, "step": 6105, "time_per_iteration": 2.4375219345092773 }, { "auxiliary_loss_clip": 0.01152913, "auxiliary_loss_mlp": 0.01026398, "balance_loss_clip": 1.04902148, "balance_loss_mlp": 1.01918542, "epoch": 0.7342030902422895, "flos": 22930112701440.0, "grad_norm": 2.013593794440417, "language_loss": 0.80205262, "learning_rate": 6.962829277318132e-07, "loss": 0.82384568, "num_input_tokens_seen": 131182515, "step": 6106, "time_per_iteration": 3.3168511390686035 }, { "auxiliary_loss_clip": 0.01152585, "auxiliary_loss_mlp": 0.01023129, "balance_loss_clip": 1.04878974, "balance_loss_mlp": 1.01645315, "epoch": 0.7343233331329285, "flos": 25847890381440.0, "grad_norm": 1.7690332967528735, "language_loss": 0.83631134, "learning_rate": 6.956923008153652e-07, "loss": 0.85806847, "num_input_tokens_seen": 131202280, "step": 6107, "time_per_iteration": 2.5339887142181396 }, { "auxiliary_loss_clip": 0.01153777, "auxiliary_loss_mlp": 0.01024858, "balance_loss_clip": 1.04611635, "balance_loss_mlp": 1.01816761, "epoch": 0.7344435760235676, "flos": 18478985287680.0, "grad_norm": 2.0704005920191753, "language_loss": 0.84147125, "learning_rate": 6.951018717613593e-07, "loss": 0.86325759, "num_input_tokens_seen": 131221295, "step": 6108, "time_per_iteration": 2.495081901550293 }, { "auxiliary_loss_clip": 0.01151879, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.04825377, "balance_loss_mlp": 1.02327561, "epoch": 0.7345638189142067, "flos": 17640040256640.0, "grad_norm": 1.7753675698479794, "language_loss": 0.78368527, "learning_rate": 6.945116406593614e-07, "loss": 0.80551189, "num_input_tokens_seen": 131240150, "step": 6109, "time_per_iteration": 2.472273588180542 }, { "auxiliary_loss_clip": 0.01111536, "auxiliary_loss_mlp": 0.01026256, "balance_loss_clip": 1.04466367, "balance_loss_mlp": 1.01930928, "epoch": 0.7346840618048458, "flos": 20260225756800.0, "grad_norm": 2.1000969550896516, "language_loss": 0.74290729, "learning_rate": 6.939216075989089e-07, "loss": 0.76428521, "num_input_tokens_seen": 131258080, "step": 6110, "time_per_iteration": 2.591343879699707 }, { "auxiliary_loss_clip": 0.011378, "auxiliary_loss_mlp": 0.01022161, "balance_loss_clip": 1.04563785, "balance_loss_mlp": 1.01499951, "epoch": 0.7348043046954849, "flos": 29023183641600.0, "grad_norm": 1.7423111879945414, "language_loss": 0.66055018, "learning_rate": 6.933317726695109e-07, "loss": 0.68214977, "num_input_tokens_seen": 131279310, "step": 6111, "time_per_iteration": 2.582185983657837 }, { "auxiliary_loss_clip": 0.01121567, "auxiliary_loss_mlp": 0.0103122, "balance_loss_clip": 1.04546785, "balance_loss_mlp": 1.02468419, "epoch": 0.734924547586124, "flos": 17931203902080.0, "grad_norm": 2.8508615732905347, "language_loss": 0.79471958, "learning_rate": 6.92742135960644e-07, "loss": 0.8162474, "num_input_tokens_seen": 131297010, "step": 6112, "time_per_iteration": 2.5187957286834717 }, { "auxiliary_loss_clip": 0.01051452, "auxiliary_loss_mlp": 0.01003704, "balance_loss_clip": 1.01223016, "balance_loss_mlp": 1.00255322, "epoch": 0.7350447904767631, "flos": 63588319850880.0, "grad_norm": 0.8122942830397575, "language_loss": 0.55747706, "learning_rate": 6.921526975617556e-07, "loss": 0.57802862, "num_input_tokens_seen": 131356470, "step": 6113, "time_per_iteration": 3.1154322624206543 }, { "auxiliary_loss_clip": 0.01138935, "auxiliary_loss_mlp": 0.01032733, "balance_loss_clip": 1.04534197, "balance_loss_mlp": 1.02522242, "epoch": 0.7351650333674021, "flos": 21580015178880.0, "grad_norm": 1.876554133146853, "language_loss": 0.753497, "learning_rate": 6.915634575622631e-07, "loss": 0.77521372, "num_input_tokens_seen": 131374985, "step": 6114, "time_per_iteration": 2.535419225692749 }, { "auxiliary_loss_clip": 0.01164288, "auxiliary_loss_mlp": 0.01027599, "balance_loss_clip": 1.0469507, "balance_loss_mlp": 1.0207206, "epoch": 0.7352852762580413, "flos": 18186349184640.0, "grad_norm": 1.7238567552660033, "language_loss": 0.70783466, "learning_rate": 6.909744160515532e-07, "loss": 0.72975355, "num_input_tokens_seen": 131393125, "step": 6115, "time_per_iteration": 2.4288628101348877 }, { "auxiliary_loss_clip": 0.01138295, "auxiliary_loss_mlp": 0.01025943, "balance_loss_clip": 1.0467844, "balance_loss_mlp": 1.01888919, "epoch": 0.7354055191486804, "flos": 38910073063680.0, "grad_norm": 1.9522917318525363, "language_loss": 0.69428176, "learning_rate": 6.903855731189849e-07, "loss": 0.71592414, "num_input_tokens_seen": 131415760, "step": 6116, "time_per_iteration": 2.687075138092041 }, { "auxiliary_loss_clip": 0.01146592, "auxiliary_loss_mlp": 0.01033281, "balance_loss_clip": 1.04791164, "balance_loss_mlp": 1.02569962, "epoch": 0.7355257620393194, "flos": 16289978647680.0, "grad_norm": 2.1567325284536127, "language_loss": 0.81925863, "learning_rate": 6.897969288538825e-07, "loss": 0.84105736, "num_input_tokens_seen": 131433705, "step": 6117, "time_per_iteration": 3.2744300365448 }, { "auxiliary_loss_clip": 0.01134461, "auxiliary_loss_mlp": 0.01026816, "balance_loss_clip": 1.04419601, "balance_loss_mlp": 1.02014959, "epoch": 0.7356460049299585, "flos": 18114240631680.0, "grad_norm": 1.5929627322204443, "language_loss": 0.81296486, "learning_rate": 6.892084833455452e-07, "loss": 0.83457768, "num_input_tokens_seen": 131453275, "step": 6118, "time_per_iteration": 2.5064010620117188 }, { "auxiliary_loss_clip": 0.01150812, "auxiliary_loss_mlp": 0.01023778, "balance_loss_clip": 1.04759479, "balance_loss_mlp": 1.01707506, "epoch": 0.7357662478205976, "flos": 21325193118720.0, "grad_norm": 1.433064373283806, "language_loss": 0.83858848, "learning_rate": 6.886202366832384e-07, "loss": 0.8603344, "num_input_tokens_seen": 131474960, "step": 6119, "time_per_iteration": 3.969604969024658 }, { "auxiliary_loss_clip": 0.01107655, "auxiliary_loss_mlp": 0.01028203, "balance_loss_clip": 1.04322886, "balance_loss_mlp": 1.02080035, "epoch": 0.7358864907112367, "flos": 14246841139200.0, "grad_norm": 1.7917890606995348, "language_loss": 0.73691416, "learning_rate": 6.880321889561987e-07, "loss": 0.75827277, "num_input_tokens_seen": 131492935, "step": 6120, "time_per_iteration": 2.558135509490967 }, { "auxiliary_loss_clip": 0.01119059, "auxiliary_loss_mlp": 0.01027369, "balance_loss_clip": 1.04197633, "balance_loss_mlp": 1.01953959, "epoch": 0.7360067336018757, "flos": 22309684058880.0, "grad_norm": 1.9773859572101127, "language_loss": 0.65461171, "learning_rate": 6.874443402536338e-07, "loss": 0.67607605, "num_input_tokens_seen": 131512025, "step": 6121, "time_per_iteration": 2.5719122886657715 }, { "auxiliary_loss_clip": 0.01141796, "auxiliary_loss_mlp": 0.01024, "balance_loss_clip": 1.04685044, "balance_loss_mlp": 1.01654315, "epoch": 0.7361269764925149, "flos": 25554607833600.0, "grad_norm": 1.6605722970797028, "language_loss": 0.80656463, "learning_rate": 6.868566906647177e-07, "loss": 0.82822263, "num_input_tokens_seen": 131532975, "step": 6122, "time_per_iteration": 2.551800489425659 }, { "auxiliary_loss_clip": 0.01151533, "auxiliary_loss_mlp": 0.01031928, "balance_loss_clip": 1.04578876, "balance_loss_mlp": 1.02465057, "epoch": 0.736247219383154, "flos": 20376505059840.0, "grad_norm": 2.103444210531492, "language_loss": 0.83497679, "learning_rate": 6.862692402785984e-07, "loss": 0.85681146, "num_input_tokens_seen": 131553225, "step": 6123, "time_per_iteration": 2.51865291595459 }, { "auxiliary_loss_clip": 0.01022022, "auxiliary_loss_mlp": 0.01001658, "balance_loss_clip": 1.01061964, "balance_loss_mlp": 1.00063312, "epoch": 0.736367462273793, "flos": 70339525735680.0, "grad_norm": 0.677095504234691, "language_loss": 0.49643466, "learning_rate": 6.856819891843899e-07, "loss": 0.51667142, "num_input_tokens_seen": 131617930, "step": 6124, "time_per_iteration": 3.2114145755767822 }, { "auxiliary_loss_clip": 0.01098423, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.04441833, "balance_loss_mlp": 1.02333939, "epoch": 0.7364877051644322, "flos": 22412711243520.0, "grad_norm": 2.0757941262500985, "language_loss": 0.71915007, "learning_rate": 6.8509493747118e-07, "loss": 0.74043679, "num_input_tokens_seen": 131636740, "step": 6125, "time_per_iteration": 2.6145901679992676 }, { "auxiliary_loss_clip": 0.01165569, "auxiliary_loss_mlp": 0.01027095, "balance_loss_clip": 1.04850543, "balance_loss_mlp": 1.020226, "epoch": 0.7366079480550712, "flos": 12130266274560.0, "grad_norm": 2.5559394421903106, "language_loss": 0.88145477, "learning_rate": 6.845080852280221e-07, "loss": 0.90338135, "num_input_tokens_seen": 131653810, "step": 6126, "time_per_iteration": 2.4200439453125 }, { "auxiliary_loss_clip": 0.01123622, "auxiliary_loss_mlp": 0.01027826, "balance_loss_clip": 1.04285884, "balance_loss_mlp": 1.02083731, "epoch": 0.7367281909457103, "flos": 15049336844160.0, "grad_norm": 1.6873112870122144, "language_loss": 0.74205244, "learning_rate": 6.839214325439409e-07, "loss": 0.76356685, "num_input_tokens_seen": 131671505, "step": 6127, "time_per_iteration": 2.549854040145874 }, { "auxiliary_loss_clip": 0.01135827, "auxiliary_loss_mlp": 0.01025809, "balance_loss_clip": 1.04903531, "balance_loss_mlp": 1.01878214, "epoch": 0.7368484338363495, "flos": 23510752053120.0, "grad_norm": 1.6432480560067482, "language_loss": 0.71636748, "learning_rate": 6.833349795079327e-07, "loss": 0.73798382, "num_input_tokens_seen": 131690615, "step": 6128, "time_per_iteration": 2.5685598850250244 }, { "auxiliary_loss_clip": 0.01126544, "auxiliary_loss_mlp": 0.01022974, "balance_loss_clip": 1.04784942, "balance_loss_mlp": 1.01630688, "epoch": 0.7369686767269885, "flos": 27417833095680.0, "grad_norm": 1.8348415970836915, "language_loss": 0.69116414, "learning_rate": 6.827487262089613e-07, "loss": 0.7126593, "num_input_tokens_seen": 131711120, "step": 6129, "time_per_iteration": 2.617788553237915 }, { "auxiliary_loss_clip": 0.01036349, "auxiliary_loss_mlp": 0.01000504, "balance_loss_clip": 1.01138496, "balance_loss_mlp": 0.99941331, "epoch": 0.7370889196176276, "flos": 70293343824000.0, "grad_norm": 0.9937897978138823, "language_loss": 0.56776977, "learning_rate": 6.821626727359606e-07, "loss": 0.58813828, "num_input_tokens_seen": 131776680, "step": 6130, "time_per_iteration": 3.182039499282837 }, { "auxiliary_loss_clip": 0.01134605, "auxiliary_loss_mlp": 0.0102789, "balance_loss_clip": 1.04698098, "balance_loss_mlp": 1.02032018, "epoch": 0.7372091625082667, "flos": 18040839189120.0, "grad_norm": 2.3629747551010922, "language_loss": 0.77015388, "learning_rate": 6.815768191778348e-07, "loss": 0.7917788, "num_input_tokens_seen": 131794760, "step": 6131, "time_per_iteration": 2.4929380416870117 }, { "auxiliary_loss_clip": 0.01146193, "auxiliary_loss_mlp": 0.01028099, "balance_loss_clip": 1.04428244, "balance_loss_mlp": 1.02055931, "epoch": 0.7373294053989058, "flos": 33726331854720.0, "grad_norm": 1.9128193068857957, "language_loss": 0.72810936, "learning_rate": 6.809911656234569e-07, "loss": 0.74985224, "num_input_tokens_seen": 131816735, "step": 6132, "time_per_iteration": 3.344209909439087 }, { "auxiliary_loss_clip": 0.0112622, "auxiliary_loss_mlp": 0.01030979, "balance_loss_clip": 1.04263628, "balance_loss_mlp": 1.02374637, "epoch": 0.7374496482895448, "flos": 21506326427520.0, "grad_norm": 2.004544754553179, "language_loss": 0.78270507, "learning_rate": 6.804057121616707e-07, "loss": 0.80427706, "num_input_tokens_seen": 131834940, "step": 6133, "time_per_iteration": 2.547750949859619 }, { "auxiliary_loss_clip": 0.01153512, "auxiliary_loss_mlp": 0.01024653, "balance_loss_clip": 1.04721165, "balance_loss_mlp": 1.01759911, "epoch": 0.737569891180184, "flos": 24936908624640.0, "grad_norm": 1.9061578368619438, "language_loss": 0.71894169, "learning_rate": 6.798204588812888e-07, "loss": 0.74072337, "num_input_tokens_seen": 131854355, "step": 6134, "time_per_iteration": 2.5295207500457764 }, { "auxiliary_loss_clip": 0.01088549, "auxiliary_loss_mlp": 0.00760957, "balance_loss_clip": 1.03970265, "balance_loss_mlp": 1.00020766, "epoch": 0.7376901340708231, "flos": 20664544222080.0, "grad_norm": 1.6768866306767447, "language_loss": 0.75310707, "learning_rate": 6.792354058710937e-07, "loss": 0.77160209, "num_input_tokens_seen": 131871825, "step": 6135, "time_per_iteration": 2.6354610919952393 }, { "auxiliary_loss_clip": 0.01159292, "auxiliary_loss_mlp": 0.01019925, "balance_loss_clip": 1.04634428, "balance_loss_mlp": 1.01336837, "epoch": 0.7378103769614621, "flos": 23805794367360.0, "grad_norm": 1.90204413442809, "language_loss": 0.64900696, "learning_rate": 6.786505532198374e-07, "loss": 0.67079914, "num_input_tokens_seen": 131890770, "step": 6136, "time_per_iteration": 2.48842191696167 }, { "auxiliary_loss_clip": 0.01167388, "auxiliary_loss_mlp": 0.01027036, "balance_loss_clip": 1.04974973, "balance_loss_mlp": 1.01974916, "epoch": 0.7379306198521013, "flos": 22237216369920.0, "grad_norm": 1.8599968147522656, "language_loss": 0.85082793, "learning_rate": 6.780659010162411e-07, "loss": 0.8727721, "num_input_tokens_seen": 131909720, "step": 6137, "time_per_iteration": 2.5080342292785645 }, { "auxiliary_loss_clip": 0.01129209, "auxiliary_loss_mlp": 0.01027177, "balance_loss_clip": 1.04591942, "balance_loss_mlp": 1.02073038, "epoch": 0.7380508627427403, "flos": 14903108576640.0, "grad_norm": 1.8481285965914338, "language_loss": 0.83218491, "learning_rate": 6.774814493489975e-07, "loss": 0.85374874, "num_input_tokens_seen": 131927395, "step": 6138, "time_per_iteration": 2.5578792095184326 }, { "auxiliary_loss_clip": 0.011462, "auxiliary_loss_mlp": 0.01033372, "balance_loss_clip": 1.04509962, "balance_loss_mlp": 1.02675033, "epoch": 0.7381711056333794, "flos": 21685843624320.0, "grad_norm": 1.646596254176413, "language_loss": 0.65994352, "learning_rate": 6.768971983067655e-07, "loss": 0.68173927, "num_input_tokens_seen": 131947725, "step": 6139, "time_per_iteration": 2.4776556491851807 }, { "auxiliary_loss_clip": 0.01058477, "auxiliary_loss_mlp": 0.01001773, "balance_loss_clip": 1.01068783, "balance_loss_mlp": 1.00067675, "epoch": 0.7382913485240186, "flos": 52404263596800.0, "grad_norm": 1.0033107505907077, "language_loss": 0.67793834, "learning_rate": 6.763131479781772e-07, "loss": 0.69854081, "num_input_tokens_seen": 131997485, "step": 6140, "time_per_iteration": 2.883711338043213 }, { "auxiliary_loss_clip": 0.01131139, "auxiliary_loss_mlp": 0.01029041, "balance_loss_clip": 1.04742503, "balance_loss_mlp": 1.02244306, "epoch": 0.7384115914146576, "flos": 21798818876160.0, "grad_norm": 1.823553578936655, "language_loss": 0.7588678, "learning_rate": 6.757292984518316e-07, "loss": 0.78046966, "num_input_tokens_seen": 132016885, "step": 6141, "time_per_iteration": 2.5236330032348633 }, { "auxiliary_loss_clip": 0.01050824, "auxiliary_loss_mlp": 0.01003425, "balance_loss_clip": 1.01204967, "balance_loss_mlp": 1.00231063, "epoch": 0.7385318343052967, "flos": 61494331662720.0, "grad_norm": 0.7497340849319888, "language_loss": 0.56442738, "learning_rate": 6.751456498162981e-07, "loss": 0.58496988, "num_input_tokens_seen": 132075920, "step": 6142, "time_per_iteration": 2.9882025718688965 }, { "auxiliary_loss_clip": 0.01150583, "auxiliary_loss_mlp": 0.01024695, "balance_loss_clip": 1.04466605, "balance_loss_mlp": 1.01801658, "epoch": 0.7386520771959358, "flos": 17013757697280.0, "grad_norm": 1.8538237747670587, "language_loss": 0.85803211, "learning_rate": 6.745622021601174e-07, "loss": 0.87978488, "num_input_tokens_seen": 132092945, "step": 6143, "time_per_iteration": 3.2462685108184814 }, { "auxiliary_loss_clip": 0.01128603, "auxiliary_loss_mlp": 0.01025475, "balance_loss_clip": 1.04579973, "balance_loss_mlp": 1.01805162, "epoch": 0.7387723200865749, "flos": 18770759464320.0, "grad_norm": 1.8143213925104127, "language_loss": 0.69765717, "learning_rate": 6.739789555717954e-07, "loss": 0.71919799, "num_input_tokens_seen": 132109920, "step": 6144, "time_per_iteration": 2.5487778186798096 }, { "auxiliary_loss_clip": 0.01163313, "auxiliary_loss_mlp": 0.010244, "balance_loss_clip": 1.04697871, "balance_loss_mlp": 1.01718152, "epoch": 0.738892562977214, "flos": 22525542840960.0, "grad_norm": 1.9445961306708712, "language_loss": 0.77170813, "learning_rate": 6.733959101398124e-07, "loss": 0.79358524, "num_input_tokens_seen": 132128050, "step": 6145, "time_per_iteration": 4.044026851654053 }, { "auxiliary_loss_clip": 0.01136018, "auxiliary_loss_mlp": 0.01024581, "balance_loss_clip": 1.04468107, "balance_loss_mlp": 1.01773572, "epoch": 0.7390128058678531, "flos": 21501478091520.0, "grad_norm": 1.7133266615157359, "language_loss": 0.81439817, "learning_rate": 6.728130659526143e-07, "loss": 0.83600414, "num_input_tokens_seen": 132145860, "step": 6146, "time_per_iteration": 2.5228497982025146 }, { "auxiliary_loss_clip": 0.01140278, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 1.04778123, "balance_loss_mlp": 1.02225721, "epoch": 0.7391330487584922, "flos": 25776176878080.0, "grad_norm": 2.3783216644567795, "language_loss": 0.70805442, "learning_rate": 6.7223042309862e-07, "loss": 0.72975099, "num_input_tokens_seen": 132166060, "step": 6147, "time_per_iteration": 2.5813348293304443 }, { "auxiliary_loss_clip": 0.01148582, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.04497266, "balance_loss_mlp": 1.02106202, "epoch": 0.7392532916491312, "flos": 28366736636160.0, "grad_norm": 2.3630964804722545, "language_loss": 0.73700744, "learning_rate": 6.716479816662144e-07, "loss": 0.75877357, "num_input_tokens_seen": 132187790, "step": 6148, "time_per_iteration": 2.568819284439087 }, { "auxiliary_loss_clip": 0.01141577, "auxiliary_loss_mlp": 0.01023097, "balance_loss_clip": 1.04539204, "balance_loss_mlp": 1.01606989, "epoch": 0.7393735345397703, "flos": 23585877348480.0, "grad_norm": 1.8353345491842068, "language_loss": 0.72944868, "learning_rate": 6.710657417437531e-07, "loss": 0.75109535, "num_input_tokens_seen": 132207495, "step": 6149, "time_per_iteration": 2.55751371383667 }, { "auxiliary_loss_clip": 0.01135284, "auxiliary_loss_mlp": 0.01020349, "balance_loss_clip": 1.04534888, "balance_loss_mlp": 1.01374221, "epoch": 0.7394937774304094, "flos": 19974772373760.0, "grad_norm": 2.482095309706797, "language_loss": 0.80376053, "learning_rate": 6.704837034195628e-07, "loss": 0.82531691, "num_input_tokens_seen": 132225960, "step": 6150, "time_per_iteration": 2.514275074005127 }, { "auxiliary_loss_clip": 0.01143688, "auxiliary_loss_mlp": 0.01030188, "balance_loss_clip": 1.04574323, "balance_loss_mlp": 1.02292848, "epoch": 0.7396140203210485, "flos": 23478037741440.0, "grad_norm": 1.6466144073528222, "language_loss": 0.84782302, "learning_rate": 6.699018667819376e-07, "loss": 0.86956179, "num_input_tokens_seen": 132245360, "step": 6151, "time_per_iteration": 2.507328510284424 }, { "auxiliary_loss_clip": 0.01144925, "auxiliary_loss_mlp": 0.01028532, "balance_loss_clip": 1.04324925, "balance_loss_mlp": 1.02113485, "epoch": 0.7397342632116876, "flos": 25555433846400.0, "grad_norm": 1.5306626590793733, "language_loss": 0.72938156, "learning_rate": 6.693202319191415e-07, "loss": 0.75111616, "num_input_tokens_seen": 132267095, "step": 6152, "time_per_iteration": 2.551039934158325 }, { "auxiliary_loss_clip": 0.01165913, "auxiliary_loss_mlp": 0.01030111, "balance_loss_clip": 1.05101967, "balance_loss_mlp": 1.02299428, "epoch": 0.7398545061023267, "flos": 24755021130240.0, "grad_norm": 1.776993029654492, "language_loss": 0.74641693, "learning_rate": 6.687387989194084e-07, "loss": 0.76837718, "num_input_tokens_seen": 132286610, "step": 6153, "time_per_iteration": 2.4801669120788574 }, { "auxiliary_loss_clip": 0.01128423, "auxiliary_loss_mlp": 0.01022082, "balance_loss_clip": 1.04474831, "balance_loss_mlp": 1.01516461, "epoch": 0.7399747489929658, "flos": 16508602776960.0, "grad_norm": 1.8371343245252565, "language_loss": 0.7927689, "learning_rate": 6.681575678709404e-07, "loss": 0.81427395, "num_input_tokens_seen": 132305300, "step": 6154, "time_per_iteration": 2.510317802429199 }, { "auxiliary_loss_clip": 0.01148033, "auxiliary_loss_mlp": 0.01029012, "balance_loss_clip": 1.04461813, "balance_loss_mlp": 1.02190685, "epoch": 0.7400949918836048, "flos": 24097065753600.0, "grad_norm": 2.0671192980471464, "language_loss": 0.70752859, "learning_rate": 6.67576538861911e-07, "loss": 0.72929901, "num_input_tokens_seen": 132323875, "step": 6155, "time_per_iteration": 2.4956181049346924 }, { "auxiliary_loss_clip": 0.01133983, "auxiliary_loss_mlp": 0.01022771, "balance_loss_clip": 1.04637492, "balance_loss_mlp": 1.01630664, "epoch": 0.740215234774244, "flos": 21802517976960.0, "grad_norm": 2.631590792316345, "language_loss": 0.82064962, "learning_rate": 6.669957119804612e-07, "loss": 0.84221721, "num_input_tokens_seen": 132345510, "step": 6156, "time_per_iteration": 2.5605835914611816 }, { "auxiliary_loss_clip": 0.01144329, "auxiliary_loss_mlp": 0.01024965, "balance_loss_clip": 1.04542327, "balance_loss_mlp": 1.01712656, "epoch": 0.7403354776648831, "flos": 18733196816640.0, "grad_norm": 2.8954011904520573, "language_loss": 0.72540522, "learning_rate": 6.66415087314702e-07, "loss": 0.74709821, "num_input_tokens_seen": 132360465, "step": 6157, "time_per_iteration": 2.5082178115844727 }, { "auxiliary_loss_clip": 0.01137277, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 1.04382348, "balance_loss_mlp": 1.01998413, "epoch": 0.7404557205555221, "flos": 16909581277440.0, "grad_norm": 2.188600692675026, "language_loss": 0.72843164, "learning_rate": 6.65834664952714e-07, "loss": 0.75007153, "num_input_tokens_seen": 132377915, "step": 6158, "time_per_iteration": 3.2086164951324463 }, { "auxiliary_loss_clip": 0.01123061, "auxiliary_loss_mlp": 0.01027088, "balance_loss_clip": 1.04206991, "balance_loss_mlp": 1.01990891, "epoch": 0.7405759634461613, "flos": 21214408596480.0, "grad_norm": 1.6569265027966444, "language_loss": 0.75860471, "learning_rate": 6.652544449825457e-07, "loss": 0.78010619, "num_input_tokens_seen": 132398170, "step": 6159, "time_per_iteration": 2.5593631267547607 }, { "auxiliary_loss_clip": 0.0114183, "auxiliary_loss_mlp": 0.01027589, "balance_loss_clip": 1.04601073, "balance_loss_mlp": 1.02042484, "epoch": 0.7406962063368003, "flos": 20480106862080.0, "grad_norm": 1.6266490807209797, "language_loss": 0.76552564, "learning_rate": 6.646744274922182e-07, "loss": 0.78721988, "num_input_tokens_seen": 132416615, "step": 6160, "time_per_iteration": 2.5394034385681152 }, { "auxiliary_loss_clip": 0.01137258, "auxiliary_loss_mlp": 0.01030683, "balance_loss_clip": 1.04361892, "balance_loss_mlp": 1.0239656, "epoch": 0.7408164492274394, "flos": 19791915212160.0, "grad_norm": 2.6144675404741156, "language_loss": 0.75606811, "learning_rate": 6.640946125697171e-07, "loss": 0.77774757, "num_input_tokens_seen": 132434145, "step": 6161, "time_per_iteration": 2.5034382343292236 }, { "auxiliary_loss_clip": 0.01151883, "auxiliary_loss_mlp": 0.01024748, "balance_loss_clip": 1.04524815, "balance_loss_mlp": 1.01689816, "epoch": 0.7409366921180786, "flos": 29204855654400.0, "grad_norm": 1.7720477352127584, "language_loss": 0.75376821, "learning_rate": 6.635150003030017e-07, "loss": 0.77553445, "num_input_tokens_seen": 132452670, "step": 6162, "time_per_iteration": 2.570927381515503 }, { "auxiliary_loss_clip": 0.01106427, "auxiliary_loss_mlp": 0.01025119, "balance_loss_clip": 1.03890634, "balance_loss_mlp": 1.01816344, "epoch": 0.7410569350087176, "flos": 22930004960640.0, "grad_norm": 1.944547130207029, "language_loss": 0.86129034, "learning_rate": 6.629355907799981e-07, "loss": 0.88260579, "num_input_tokens_seen": 132472475, "step": 6163, "time_per_iteration": 2.5772950649261475 }, { "auxiliary_loss_clip": 0.01154401, "auxiliary_loss_mlp": 0.01028229, "balance_loss_clip": 1.04672194, "balance_loss_mlp": 1.02100825, "epoch": 0.7411771778993567, "flos": 30440397726720.0, "grad_norm": 1.687966527970153, "language_loss": 0.69099963, "learning_rate": 6.623563840886015e-07, "loss": 0.71282601, "num_input_tokens_seen": 132493400, "step": 6164, "time_per_iteration": 2.5851614475250244 }, { "auxiliary_loss_clip": 0.01146708, "auxiliary_loss_mlp": 0.0102506, "balance_loss_clip": 1.04458117, "balance_loss_mlp": 1.01829481, "epoch": 0.7412974207899958, "flos": 20522050968960.0, "grad_norm": 1.6446349389779142, "language_loss": 0.69849735, "learning_rate": 6.617773803166795e-07, "loss": 0.72021502, "num_input_tokens_seen": 132511725, "step": 6165, "time_per_iteration": 2.4932861328125 }, { "auxiliary_loss_clip": 0.01141592, "auxiliary_loss_mlp": 0.00761077, "balance_loss_clip": 1.04529858, "balance_loss_mlp": 1.00029135, "epoch": 0.7414176636806349, "flos": 22090700793600.0, "grad_norm": 2.2104007692142593, "language_loss": 0.81623411, "learning_rate": 6.611985795520634e-07, "loss": 0.83526075, "num_input_tokens_seen": 132530270, "step": 6166, "time_per_iteration": 2.5477118492126465 }, { "auxiliary_loss_clip": 0.0113232, "auxiliary_loss_mlp": 0.01027566, "balance_loss_clip": 1.04623342, "balance_loss_mlp": 1.02059197, "epoch": 0.7415379065712739, "flos": 25155245445120.0, "grad_norm": 1.942712286550165, "language_loss": 0.77781779, "learning_rate": 6.606199818825588e-07, "loss": 0.79941666, "num_input_tokens_seen": 132550725, "step": 6167, "time_per_iteration": 2.5791823863983154 }, { "auxiliary_loss_clip": 0.0114033, "auxiliary_loss_mlp": 0.01028456, "balance_loss_clip": 1.04209375, "balance_loss_mlp": 1.02136326, "epoch": 0.7416581494619131, "flos": 16871731320960.0, "grad_norm": 1.8510703675873201, "language_loss": 0.82116914, "learning_rate": 6.600415873959377e-07, "loss": 0.84285694, "num_input_tokens_seen": 132568600, "step": 6168, "time_per_iteration": 2.5078556537628174 }, { "auxiliary_loss_clip": 0.0109157, "auxiliary_loss_mlp": 0.00759558, "balance_loss_clip": 1.0376941, "balance_loss_mlp": 1.00021398, "epoch": 0.7417783923525522, "flos": 28438881102720.0, "grad_norm": 2.2725349970653497, "language_loss": 0.64730954, "learning_rate": 6.594633961799437e-07, "loss": 0.66582084, "num_input_tokens_seen": 132587640, "step": 6169, "time_per_iteration": 3.4517016410827637 }, { "auxiliary_loss_clip": 0.01132239, "auxiliary_loss_mlp": 0.01030697, "balance_loss_clip": 1.0447346, "balance_loss_mlp": 1.02362823, "epoch": 0.7418986352431912, "flos": 20084299920000.0, "grad_norm": 1.5555099974047548, "language_loss": 0.81813264, "learning_rate": 6.588854083222857e-07, "loss": 0.83976197, "num_input_tokens_seen": 132607075, "step": 6170, "time_per_iteration": 2.572512626647949 }, { "auxiliary_loss_clip": 0.01139197, "auxiliary_loss_mlp": 0.01025401, "balance_loss_clip": 1.04662967, "balance_loss_mlp": 1.01789081, "epoch": 0.7420188781338304, "flos": 18259571059200.0, "grad_norm": 1.8318278030641693, "language_loss": 0.80893505, "learning_rate": 6.583076239106444e-07, "loss": 0.83058107, "num_input_tokens_seen": 132625580, "step": 6171, "time_per_iteration": 3.9722211360931396 }, { "auxiliary_loss_clip": 0.01143202, "auxiliary_loss_mlp": 0.01025115, "balance_loss_clip": 1.04547739, "balance_loss_mlp": 1.0177927, "epoch": 0.7421391210244694, "flos": 13771994319360.0, "grad_norm": 3.317409160506621, "language_loss": 0.75216711, "learning_rate": 6.577300430326707e-07, "loss": 0.77385026, "num_input_tokens_seen": 132640525, "step": 6172, "time_per_iteration": 2.4950003623962402 }, { "auxiliary_loss_clip": 0.01121405, "auxiliary_loss_mlp": 0.01023046, "balance_loss_clip": 1.04563355, "balance_loss_mlp": 1.01645374, "epoch": 0.7422593639151085, "flos": 15961683317760.0, "grad_norm": 1.9282927725839085, "language_loss": 0.71799242, "learning_rate": 6.571526657759821e-07, "loss": 0.73943698, "num_input_tokens_seen": 132656265, "step": 6173, "time_per_iteration": 2.5470693111419678 }, { "auxiliary_loss_clip": 0.0114336, "auxiliary_loss_mlp": 0.010252, "balance_loss_clip": 1.04318297, "balance_loss_mlp": 1.01839972, "epoch": 0.7423796068057477, "flos": 30114400867200.0, "grad_norm": 1.6130269624329108, "language_loss": 0.70783198, "learning_rate": 6.565754922281663e-07, "loss": 0.72951758, "num_input_tokens_seen": 132678510, "step": 6174, "time_per_iteration": 2.5488967895507812 }, { "auxiliary_loss_clip": 0.01136095, "auxiliary_loss_mlp": 0.01023199, "balance_loss_clip": 1.04440928, "balance_loss_mlp": 1.01633, "epoch": 0.7424998496963867, "flos": 20521907314560.0, "grad_norm": 1.7320889884932456, "language_loss": 0.78373528, "learning_rate": 6.559985224767801e-07, "loss": 0.80532825, "num_input_tokens_seen": 132696385, "step": 6175, "time_per_iteration": 2.5369863510131836 }, { "auxiliary_loss_clip": 0.01123514, "auxiliary_loss_mlp": 0.01028778, "balance_loss_clip": 1.04377913, "balance_loss_mlp": 1.02162862, "epoch": 0.7426200925870258, "flos": 21871573873920.0, "grad_norm": 2.6196292202306553, "language_loss": 0.75514561, "learning_rate": 6.55421756609349e-07, "loss": 0.77666855, "num_input_tokens_seen": 132714640, "step": 6176, "time_per_iteration": 2.53421950340271 }, { "auxiliary_loss_clip": 0.01149731, "auxiliary_loss_mlp": 0.01027268, "balance_loss_clip": 1.04975557, "balance_loss_mlp": 1.01991618, "epoch": 0.7427403354776649, "flos": 26432049265920.0, "grad_norm": 1.8324216644891627, "language_loss": 0.78696227, "learning_rate": 6.54845194713369e-07, "loss": 0.80873233, "num_input_tokens_seen": 132735590, "step": 6177, "time_per_iteration": 2.5513641834259033 }, { "auxiliary_loss_clip": 0.01150066, "auxiliary_loss_mlp": 0.01025688, "balance_loss_clip": 1.04828858, "balance_loss_mlp": 1.0188508, "epoch": 0.742860578368304, "flos": 19898390102400.0, "grad_norm": 2.452339234687682, "language_loss": 0.79440141, "learning_rate": 6.542688368763034e-07, "loss": 0.81615889, "num_input_tokens_seen": 132753995, "step": 6178, "time_per_iteration": 2.4721994400024414 }, { "auxiliary_loss_clip": 0.01149363, "auxiliary_loss_mlp": 0.01026019, "balance_loss_clip": 1.04787886, "balance_loss_mlp": 1.01943862, "epoch": 0.742980821258943, "flos": 24827201510400.0, "grad_norm": 1.5251177458905527, "language_loss": 0.7728911, "learning_rate": 6.536926831855854e-07, "loss": 0.79464489, "num_input_tokens_seen": 132773160, "step": 6179, "time_per_iteration": 2.5258538722991943 }, { "auxiliary_loss_clip": 0.01134964, "auxiliary_loss_mlp": 0.01025105, "balance_loss_clip": 1.04668248, "balance_loss_mlp": 1.01805341, "epoch": 0.7431010641495821, "flos": 25228646887680.0, "grad_norm": 2.1912590599212947, "language_loss": 0.73036557, "learning_rate": 6.531167337286165e-07, "loss": 0.75196624, "num_input_tokens_seen": 132793180, "step": 6180, "time_per_iteration": 2.5401129722595215 }, { "auxiliary_loss_clip": 0.01135142, "auxiliary_loss_mlp": 0.01025519, "balance_loss_clip": 1.04718065, "balance_loss_mlp": 1.01832175, "epoch": 0.7432213070402213, "flos": 21762369550080.0, "grad_norm": 1.6145057994589487, "language_loss": 0.79473388, "learning_rate": 6.52540988592768e-07, "loss": 0.81634045, "num_input_tokens_seen": 132814200, "step": 6181, "time_per_iteration": 2.5438971519470215 }, { "auxiliary_loss_clip": 0.01141013, "auxiliary_loss_mlp": 0.01026148, "balance_loss_clip": 1.046404, "balance_loss_mlp": 1.01889372, "epoch": 0.7433415499308603, "flos": 14793832425600.0, "grad_norm": 1.883145803787023, "language_loss": 0.8327136, "learning_rate": 6.519654478653814e-07, "loss": 0.85438514, "num_input_tokens_seen": 132832565, "step": 6182, "time_per_iteration": 2.50787091255188 }, { "auxiliary_loss_clip": 0.01039672, "auxiliary_loss_mlp": 0.01000864, "balance_loss_clip": 1.00979686, "balance_loss_mlp": 0.99971402, "epoch": 0.7434617928214994, "flos": 67155577297920.0, "grad_norm": 0.7434844906592243, "language_loss": 0.56135583, "learning_rate": 6.51390111633763e-07, "loss": 0.58176124, "num_input_tokens_seen": 132897840, "step": 6183, "time_per_iteration": 3.17803955078125 }, { "auxiliary_loss_clip": 0.01092535, "auxiliary_loss_mlp": 0.01023415, "balance_loss_clip": 1.03901339, "balance_loss_mlp": 1.01658762, "epoch": 0.7435820357121385, "flos": 27377576928000.0, "grad_norm": 1.5171715445883973, "language_loss": 0.76304036, "learning_rate": 6.508149799851932e-07, "loss": 0.78419983, "num_input_tokens_seen": 132919505, "step": 6184, "time_per_iteration": 3.4580113887786865 }, { "auxiliary_loss_clip": 0.01133643, "auxiliary_loss_mlp": 0.01020428, "balance_loss_clip": 1.0451448, "balance_loss_mlp": 1.01401114, "epoch": 0.7437022786027776, "flos": 23987645948160.0, "grad_norm": 1.947401492562077, "language_loss": 0.61395884, "learning_rate": 6.502400530069183e-07, "loss": 0.63549948, "num_input_tokens_seen": 132939390, "step": 6185, "time_per_iteration": 2.5627334117889404 }, { "auxiliary_loss_clip": 0.01126574, "auxiliary_loss_mlp": 0.01029526, "balance_loss_clip": 1.0474, "balance_loss_mlp": 1.02150941, "epoch": 0.7438225214934167, "flos": 21866761451520.0, "grad_norm": 1.6370682420757459, "language_loss": 0.68387985, "learning_rate": 6.496653307861535e-07, "loss": 0.70544076, "num_input_tokens_seen": 132960060, "step": 6186, "time_per_iteration": 2.5780794620513916 }, { "auxiliary_loss_clip": 0.01155658, "auxiliary_loss_mlp": 0.0102911, "balance_loss_clip": 1.04650307, "balance_loss_mlp": 1.02232063, "epoch": 0.7439427643840558, "flos": 20230097224320.0, "grad_norm": 1.944131808770365, "language_loss": 0.65853155, "learning_rate": 6.490908134100857e-07, "loss": 0.68037921, "num_input_tokens_seen": 132978525, "step": 6187, "time_per_iteration": 2.4751555919647217 }, { "auxiliary_loss_clip": 0.01155397, "auxiliary_loss_mlp": 0.01026809, "balance_loss_clip": 1.04768372, "balance_loss_mlp": 1.01997864, "epoch": 0.7440630072746949, "flos": 20849915335680.0, "grad_norm": 2.0537084445751255, "language_loss": 0.69182646, "learning_rate": 6.48516500965866e-07, "loss": 0.71364856, "num_input_tokens_seen": 132998460, "step": 6188, "time_per_iteration": 2.5139377117156982 }, { "auxiliary_loss_clip": 0.01152989, "auxiliary_loss_mlp": 0.01023896, "balance_loss_clip": 1.04461336, "balance_loss_mlp": 1.01697326, "epoch": 0.7441832501653339, "flos": 26503762769280.0, "grad_norm": 1.6161538533865363, "language_loss": 0.81556493, "learning_rate": 6.479423935406192e-07, "loss": 0.8373338, "num_input_tokens_seen": 133018445, "step": 6189, "time_per_iteration": 2.5277135372161865 }, { "auxiliary_loss_clip": 0.01033325, "auxiliary_loss_mlp": 0.01002864, "balance_loss_clip": 1.0107131, "balance_loss_mlp": 1.00189877, "epoch": 0.7443034930559731, "flos": 68602848088320.0, "grad_norm": 0.8098749032001336, "language_loss": 0.61988962, "learning_rate": 6.473684912214357e-07, "loss": 0.64025152, "num_input_tokens_seen": 133082005, "step": 6190, "time_per_iteration": 3.256326198577881 }, { "auxiliary_loss_clip": 0.01151879, "auxiliary_loss_mlp": 0.01029996, "balance_loss_clip": 1.04806519, "balance_loss_mlp": 1.02252734, "epoch": 0.7444237359466122, "flos": 18654982951680.0, "grad_norm": 1.9418326515742503, "language_loss": 0.69673413, "learning_rate": 6.467947940953778e-07, "loss": 0.71855283, "num_input_tokens_seen": 133100530, "step": 6191, "time_per_iteration": 2.4932548999786377 }, { "auxiliary_loss_clip": 0.0113491, "auxiliary_loss_mlp": 0.01025062, "balance_loss_clip": 1.04351151, "balance_loss_mlp": 1.01839256, "epoch": 0.7445439788372512, "flos": 22817604326400.0, "grad_norm": 1.862387235930378, "language_loss": 0.72491407, "learning_rate": 6.462213022494732e-07, "loss": 0.74651378, "num_input_tokens_seen": 133119775, "step": 6192, "time_per_iteration": 2.537412405014038 }, { "auxiliary_loss_clip": 0.01049656, "auxiliary_loss_mlp": 0.01000739, "balance_loss_clip": 1.01045442, "balance_loss_mlp": 0.99966013, "epoch": 0.7446642217278904, "flos": 67045690615680.0, "grad_norm": 2.6854363585244263, "language_loss": 0.61095309, "learning_rate": 6.456480157707201e-07, "loss": 0.63145703, "num_input_tokens_seen": 133184550, "step": 6193, "time_per_iteration": 3.0377211570739746 }, { "auxiliary_loss_clip": 0.01116945, "auxiliary_loss_mlp": 0.01024211, "balance_loss_clip": 1.04308438, "balance_loss_mlp": 1.01709771, "epoch": 0.7447844646185294, "flos": 17417465631360.0, "grad_norm": 2.05414268661914, "language_loss": 0.85190415, "learning_rate": 6.450749347460866e-07, "loss": 0.87331569, "num_input_tokens_seen": 133201525, "step": 6194, "time_per_iteration": 2.538580894470215 }, { "auxiliary_loss_clip": 0.01167593, "auxiliary_loss_mlp": 0.01026336, "balance_loss_clip": 1.04921508, "balance_loss_mlp": 1.01898718, "epoch": 0.7449047075091685, "flos": 26615876094720.0, "grad_norm": 1.7320051721240353, "language_loss": 0.79471231, "learning_rate": 6.445020592625083e-07, "loss": 0.8166517, "num_input_tokens_seen": 133222175, "step": 6195, "time_per_iteration": 2.497760057449341 }, { "auxiliary_loss_clip": 0.01163742, "auxiliary_loss_mlp": 0.01022491, "balance_loss_clip": 1.04662657, "balance_loss_mlp": 1.01549983, "epoch": 0.7450249503998077, "flos": 14170458867840.0, "grad_norm": 2.6964442897500964, "language_loss": 0.80402261, "learning_rate": 6.4392938940689e-07, "loss": 0.82588494, "num_input_tokens_seen": 133237590, "step": 6196, "time_per_iteration": 3.1899240016937256 }, { "auxiliary_loss_clip": 0.01105921, "auxiliary_loss_mlp": 0.00760835, "balance_loss_clip": 1.04297066, "balance_loss_mlp": 1.00020301, "epoch": 0.7451451932904467, "flos": 19606687752960.0, "grad_norm": 2.3481652368734465, "language_loss": 0.71362913, "learning_rate": 6.433569252661049e-07, "loss": 0.73229671, "num_input_tokens_seen": 133255590, "step": 6197, "time_per_iteration": 4.112748622894287 }, { "auxiliary_loss_clip": 0.01118376, "auxiliary_loss_mlp": 0.01023584, "balance_loss_clip": 1.04264414, "balance_loss_mlp": 1.01716161, "epoch": 0.7452654361810858, "flos": 12495405980160.0, "grad_norm": 1.7625281564161643, "language_loss": 0.71511972, "learning_rate": 6.427846669269952e-07, "loss": 0.73653924, "num_input_tokens_seen": 133273210, "step": 6198, "time_per_iteration": 2.5607643127441406 }, { "auxiliary_loss_clip": 0.01168595, "auxiliary_loss_mlp": 0.01028533, "balance_loss_clip": 1.05155122, "balance_loss_mlp": 1.02195609, "epoch": 0.7453856790717249, "flos": 22127329687680.0, "grad_norm": 1.9546354663939223, "language_loss": 0.82365346, "learning_rate": 6.422126144763729e-07, "loss": 0.84562474, "num_input_tokens_seen": 133292600, "step": 6199, "time_per_iteration": 2.470552444458008 }, { "auxiliary_loss_clip": 0.01121788, "auxiliary_loss_mlp": 0.00761184, "balance_loss_clip": 1.04064679, "balance_loss_mlp": 1.0002017, "epoch": 0.745505921962364, "flos": 20010682995840.0, "grad_norm": 2.0362318241114186, "language_loss": 0.77421319, "learning_rate": 6.416407680010174e-07, "loss": 0.7930429, "num_input_tokens_seen": 133306960, "step": 6200, "time_per_iteration": 2.556100368499756 }, { "auxiliary_loss_clip": 0.01118633, "auxiliary_loss_mlp": 0.01028224, "balance_loss_clip": 1.04396021, "balance_loss_mlp": 1.02143836, "epoch": 0.745626164853003, "flos": 24677884673280.0, "grad_norm": 2.0249996112742257, "language_loss": 0.80610418, "learning_rate": 6.410691275876774e-07, "loss": 0.82757282, "num_input_tokens_seen": 133326380, "step": 6201, "time_per_iteration": 2.618307590484619 }, { "auxiliary_loss_clip": 0.0114587, "auxiliary_loss_mlp": 0.01022999, "balance_loss_clip": 1.04893613, "balance_loss_mlp": 1.01635563, "epoch": 0.7457464077436422, "flos": 14538830797440.0, "grad_norm": 2.2401438240346288, "language_loss": 0.77095437, "learning_rate": 6.404976933230704e-07, "loss": 0.79264307, "num_input_tokens_seen": 133342900, "step": 6202, "time_per_iteration": 2.5525009632110596 }, { "auxiliary_loss_clip": 0.01143448, "auxiliary_loss_mlp": 0.01022944, "balance_loss_clip": 1.04619694, "balance_loss_mlp": 1.01566005, "epoch": 0.7458666506342813, "flos": 34021194600960.0, "grad_norm": 2.2203568004828798, "language_loss": 0.72475088, "learning_rate": 6.399264652938813e-07, "loss": 0.74641478, "num_input_tokens_seen": 133363805, "step": 6203, "time_per_iteration": 2.626080274581909 }, { "auxiliary_loss_clip": 0.01137248, "auxiliary_loss_mlp": 0.01027195, "balance_loss_clip": 1.04484487, "balance_loss_mlp": 1.02005136, "epoch": 0.7459868935249203, "flos": 24279025075200.0, "grad_norm": 1.91052677467262, "language_loss": 0.74709845, "learning_rate": 6.393554435867679e-07, "loss": 0.76874286, "num_input_tokens_seen": 133384655, "step": 6204, "time_per_iteration": 2.5494868755340576 }, { "auxiliary_loss_clip": 0.01119765, "auxiliary_loss_mlp": 0.01025691, "balance_loss_clip": 1.04212856, "balance_loss_mlp": 1.01838982, "epoch": 0.7461071364155595, "flos": 21908777385600.0, "grad_norm": 5.854153783758928, "language_loss": 0.83494306, "learning_rate": 6.387846282883502e-07, "loss": 0.85639763, "num_input_tokens_seen": 133401185, "step": 6205, "time_per_iteration": 2.580301284790039 }, { "auxiliary_loss_clip": 0.01163707, "auxiliary_loss_mlp": 0.01025772, "balance_loss_clip": 1.04860544, "balance_loss_mlp": 1.01872075, "epoch": 0.7462273793061985, "flos": 22889712879360.0, "grad_norm": 1.9888718236730922, "language_loss": 0.76922107, "learning_rate": 6.38214019485223e-07, "loss": 0.79111588, "num_input_tokens_seen": 133420010, "step": 6206, "time_per_iteration": 2.471585988998413 }, { "auxiliary_loss_clip": 0.0109036, "auxiliary_loss_mlp": 0.01024073, "balance_loss_clip": 1.03860855, "balance_loss_mlp": 1.01737094, "epoch": 0.7463476221968376, "flos": 19968451580160.0, "grad_norm": 1.7473812132094726, "language_loss": 0.71357512, "learning_rate": 6.376436172639461e-07, "loss": 0.73471951, "num_input_tokens_seen": 133437855, "step": 6207, "time_per_iteration": 2.598330497741699 }, { "auxiliary_loss_clip": 0.01081242, "auxiliary_loss_mlp": 0.0102782, "balance_loss_clip": 1.03997993, "balance_loss_mlp": 1.020226, "epoch": 0.7464678650874768, "flos": 16836610798080.0, "grad_norm": 2.964567765805751, "language_loss": 0.64868027, "learning_rate": 6.370734217110487e-07, "loss": 0.6697709, "num_input_tokens_seen": 133456600, "step": 6208, "time_per_iteration": 2.7120487689971924 }, { "auxiliary_loss_clip": 0.01142776, "auxiliary_loss_mlp": 0.01027763, "balance_loss_clip": 1.04701436, "balance_loss_mlp": 1.01978779, "epoch": 0.7465881079781158, "flos": 48100869843840.0, "grad_norm": 1.3741774626888217, "language_loss": 0.64222562, "learning_rate": 6.36503432913031e-07, "loss": 0.66393101, "num_input_tokens_seen": 133479745, "step": 6209, "time_per_iteration": 2.922898292541504 }, { "auxiliary_loss_clip": 0.01150229, "auxiliary_loss_mlp": 0.01028905, "balance_loss_clip": 1.04766035, "balance_loss_mlp": 1.02104867, "epoch": 0.7467083508687549, "flos": 19677359761920.0, "grad_norm": 3.9942417438002353, "language_loss": 0.68906212, "learning_rate": 6.359336509563569e-07, "loss": 0.71085346, "num_input_tokens_seen": 133495765, "step": 6210, "time_per_iteration": 3.223353385925293 }, { "auxiliary_loss_clip": 0.01109996, "auxiliary_loss_mlp": 0.01028084, "balance_loss_clip": 1.04169154, "balance_loss_mlp": 1.02106893, "epoch": 0.7468285937593939, "flos": 17895436934400.0, "grad_norm": 1.7070421499167272, "language_loss": 0.80409968, "learning_rate": 6.353640759274641e-07, "loss": 0.82548046, "num_input_tokens_seen": 133514655, "step": 6211, "time_per_iteration": 2.5277109146118164 }, { "auxiliary_loss_clip": 0.01148584, "auxiliary_loss_mlp": 0.01027856, "balance_loss_clip": 1.04336596, "balance_loss_mlp": 1.0207001, "epoch": 0.7469488366500331, "flos": 23141446369920.0, "grad_norm": 3.3349130226226418, "language_loss": 0.73958749, "learning_rate": 6.347947079127556e-07, "loss": 0.76135188, "num_input_tokens_seen": 133532555, "step": 6212, "time_per_iteration": 2.5230050086975098 }, { "auxiliary_loss_clip": 0.01132835, "auxiliary_loss_mlp": 0.0102861, "balance_loss_clip": 1.04498696, "balance_loss_mlp": 1.02149951, "epoch": 0.7470690795406721, "flos": 16690849407360.0, "grad_norm": 2.9075449048490563, "language_loss": 0.76228839, "learning_rate": 6.342255469986053e-07, "loss": 0.78390288, "num_input_tokens_seen": 133551300, "step": 6213, "time_per_iteration": 2.5390121936798096 }, { "auxiliary_loss_clip": 0.01164927, "auxiliary_loss_mlp": 0.01023461, "balance_loss_clip": 1.04870117, "balance_loss_mlp": 1.01661503, "epoch": 0.7471893224313112, "flos": 25192700352000.0, "grad_norm": 1.7048372786251287, "language_loss": 0.76533854, "learning_rate": 6.336565932713533e-07, "loss": 0.78722239, "num_input_tokens_seen": 133570725, "step": 6214, "time_per_iteration": 2.5210957527160645 }, { "auxiliary_loss_clip": 0.0113371, "auxiliary_loss_mlp": 0.01027818, "balance_loss_clip": 1.04558325, "balance_loss_mlp": 1.02040899, "epoch": 0.7473095653219504, "flos": 22526225199360.0, "grad_norm": 1.7392802217638244, "language_loss": 0.77739358, "learning_rate": 6.330878468173088e-07, "loss": 0.79900885, "num_input_tokens_seen": 133590790, "step": 6215, "time_per_iteration": 2.559904098510742 }, { "auxiliary_loss_clip": 0.01141581, "auxiliary_loss_mlp": 0.01026781, "balance_loss_clip": 1.04366863, "balance_loss_mlp": 1.01977694, "epoch": 0.7474298082125894, "flos": 18113989236480.0, "grad_norm": 1.700542724937735, "language_loss": 0.73017275, "learning_rate": 6.32519307722752e-07, "loss": 0.75185633, "num_input_tokens_seen": 133608685, "step": 6216, "time_per_iteration": 2.4790258407592773 }, { "auxiliary_loss_clip": 0.01022403, "auxiliary_loss_mlp": 0.01001973, "balance_loss_clip": 1.01054096, "balance_loss_mlp": 1.00092399, "epoch": 0.7475500511032285, "flos": 62086535193600.0, "grad_norm": 0.8994235518745616, "language_loss": 0.54997462, "learning_rate": 6.31950976073929e-07, "loss": 0.57021838, "num_input_tokens_seen": 133662775, "step": 6217, "time_per_iteration": 3.1623966693878174 }, { "auxiliary_loss_clip": 0.01104223, "auxiliary_loss_mlp": 0.01027959, "balance_loss_clip": 1.04217434, "balance_loss_mlp": 1.02008569, "epoch": 0.7476702939938676, "flos": 17785586165760.0, "grad_norm": 1.9995919926693717, "language_loss": 0.81019992, "learning_rate": 6.31382851957055e-07, "loss": 0.83152175, "num_input_tokens_seen": 133679595, "step": 6218, "time_per_iteration": 2.593344211578369 }, { "auxiliary_loss_clip": 0.01122455, "auxiliary_loss_mlp": 0.00760589, "balance_loss_clip": 1.04594731, "balance_loss_mlp": 1.00021648, "epoch": 0.7477905368845067, "flos": 27927944092800.0, "grad_norm": 1.9310954025710074, "language_loss": 0.71458554, "learning_rate": 6.308149354583143e-07, "loss": 0.73341602, "num_input_tokens_seen": 133699000, "step": 6219, "time_per_iteration": 2.6697282791137695 }, { "auxiliary_loss_clip": 0.01155813, "auxiliary_loss_mlp": 0.01031158, "balance_loss_clip": 1.04795444, "balance_loss_mlp": 1.02284956, "epoch": 0.7479107797751458, "flos": 26870374932480.0, "grad_norm": 1.8553642674760444, "language_loss": 0.81663883, "learning_rate": 6.302472266638586e-07, "loss": 0.83850861, "num_input_tokens_seen": 133719540, "step": 6220, "time_per_iteration": 2.5561609268188477 }, { "auxiliary_loss_clip": 0.01172719, "auxiliary_loss_mlp": 0.01031237, "balance_loss_clip": 1.05093002, "balance_loss_mlp": 1.02310753, "epoch": 0.7480310226657849, "flos": 33943375785600.0, "grad_norm": 2.5993052402991217, "language_loss": 0.70253623, "learning_rate": 6.296797256598101e-07, "loss": 0.72457576, "num_input_tokens_seen": 133741020, "step": 6221, "time_per_iteration": 2.600849151611328 }, { "auxiliary_loss_clip": 0.01114294, "auxiliary_loss_mlp": 0.01025195, "balance_loss_clip": 1.04201365, "balance_loss_mlp": 1.01836991, "epoch": 0.748151265556424, "flos": 24826555065600.0, "grad_norm": 1.6543087262106295, "language_loss": 0.814179, "learning_rate": 6.291124325322576e-07, "loss": 0.83557391, "num_input_tokens_seen": 133761145, "step": 6222, "time_per_iteration": 3.377903938293457 }, { "auxiliary_loss_clip": 0.01141815, "auxiliary_loss_mlp": 0.01023043, "balance_loss_clip": 1.04591477, "balance_loss_mlp": 1.01595628, "epoch": 0.748271508447063, "flos": 38399351535360.0, "grad_norm": 1.6001818472873717, "language_loss": 0.62480676, "learning_rate": 6.285453473672595e-07, "loss": 0.64645541, "num_input_tokens_seen": 133783715, "step": 6223, "time_per_iteration": 4.21563458442688 }, { "auxiliary_loss_clip": 0.01162535, "auxiliary_loss_mlp": 0.01026449, "balance_loss_clip": 1.04682112, "balance_loss_mlp": 1.01948714, "epoch": 0.7483917513377022, "flos": 21541842000000.0, "grad_norm": 1.9302349127005114, "language_loss": 0.7559973, "learning_rate": 6.279784702508415e-07, "loss": 0.77788711, "num_input_tokens_seen": 133804465, "step": 6224, "time_per_iteration": 2.4753615856170654 }, { "auxiliary_loss_clip": 0.0102601, "auxiliary_loss_mlp": 0.01001275, "balance_loss_clip": 1.00908518, "balance_loss_mlp": 1.00010681, "epoch": 0.7485119942283412, "flos": 62314532772480.0, "grad_norm": 0.7727059596762108, "language_loss": 0.58593297, "learning_rate": 6.274118012689979e-07, "loss": 0.60620582, "num_input_tokens_seen": 133866365, "step": 6225, "time_per_iteration": 3.24021577835083 }, { "auxiliary_loss_clip": 0.01129283, "auxiliary_loss_mlp": 0.01029064, "balance_loss_clip": 1.04390049, "balance_loss_mlp": 1.02111602, "epoch": 0.7486322371189803, "flos": 29937613104000.0, "grad_norm": 1.4448072906289855, "language_loss": 0.67984658, "learning_rate": 6.268453405076943e-07, "loss": 0.70143008, "num_input_tokens_seen": 133888760, "step": 6226, "time_per_iteration": 2.6113390922546387 }, { "auxiliary_loss_clip": 0.01138838, "auxiliary_loss_mlp": 0.01025389, "balance_loss_clip": 1.04516077, "balance_loss_mlp": 1.01911569, "epoch": 0.7487524800096195, "flos": 18949414734720.0, "grad_norm": 2.1166555255141954, "language_loss": 0.82033408, "learning_rate": 6.262790880528592e-07, "loss": 0.84197634, "num_input_tokens_seen": 133906380, "step": 6227, "time_per_iteration": 2.51828932762146 }, { "auxiliary_loss_clip": 0.01130789, "auxiliary_loss_mlp": 0.01026119, "balance_loss_clip": 1.04139209, "balance_loss_mlp": 1.0187701, "epoch": 0.7488727229002585, "flos": 18697393935360.0, "grad_norm": 2.372877871825076, "language_loss": 0.79803568, "learning_rate": 6.257130439903951e-07, "loss": 0.81960475, "num_input_tokens_seen": 133922875, "step": 6228, "time_per_iteration": 2.5349538326263428 }, { "auxiliary_loss_clip": 0.01169141, "auxiliary_loss_mlp": 0.01030798, "balance_loss_clip": 1.0506469, "balance_loss_mlp": 1.02367508, "epoch": 0.7489929657908976, "flos": 23623368168960.0, "grad_norm": 1.7849136989866308, "language_loss": 0.812433, "learning_rate": 6.251472084061695e-07, "loss": 0.83443236, "num_input_tokens_seen": 133941795, "step": 6229, "time_per_iteration": 2.4966917037963867 }, { "auxiliary_loss_clip": 0.01150643, "auxiliary_loss_mlp": 0.01027923, "balance_loss_clip": 1.04748654, "balance_loss_mlp": 1.02101469, "epoch": 0.7491132086815367, "flos": 20551533056640.0, "grad_norm": 4.99044604885606, "language_loss": 0.88620651, "learning_rate": 6.245815813860191e-07, "loss": 0.90799212, "num_input_tokens_seen": 133957305, "step": 6230, "time_per_iteration": 2.4961092472076416 }, { "auxiliary_loss_clip": 0.01167032, "auxiliary_loss_mlp": 0.0102541, "balance_loss_clip": 1.04863238, "balance_loss_mlp": 1.01759005, "epoch": 0.7492334515721758, "flos": 23003011353600.0, "grad_norm": 1.9230898194822925, "language_loss": 0.70347106, "learning_rate": 6.240161630157495e-07, "loss": 0.7253955, "num_input_tokens_seen": 133976660, "step": 6231, "time_per_iteration": 2.472421884536743 }, { "auxiliary_loss_clip": 0.01167459, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.04819429, "balance_loss_mlp": 1.02085495, "epoch": 0.7493536944628149, "flos": 16398823835520.0, "grad_norm": 2.54103356269205, "language_loss": 0.6970799, "learning_rate": 6.23450953381133e-07, "loss": 0.71904725, "num_input_tokens_seen": 133994750, "step": 6232, "time_per_iteration": 2.459921360015869 }, { "auxiliary_loss_clip": 0.01130608, "auxiliary_loss_mlp": 0.01021578, "balance_loss_clip": 1.04411805, "balance_loss_mlp": 1.01509261, "epoch": 0.749473937353454, "flos": 15338561155200.0, "grad_norm": 2.5526628539795913, "language_loss": 0.67994773, "learning_rate": 6.228859525679131e-07, "loss": 0.70146954, "num_input_tokens_seen": 134009165, "step": 6233, "time_per_iteration": 2.5213265419006348 }, { "auxiliary_loss_clip": 0.01153059, "auxiliary_loss_mlp": 0.01024283, "balance_loss_clip": 1.04804552, "balance_loss_mlp": 1.01781583, "epoch": 0.7495941802440931, "flos": 18951138587520.0, "grad_norm": 2.0380430648144823, "language_loss": 0.80231297, "learning_rate": 6.223211606617986e-07, "loss": 0.82408631, "num_input_tokens_seen": 134027585, "step": 6234, "time_per_iteration": 2.5344016551971436 }, { "auxiliary_loss_clip": 0.01150538, "auxiliary_loss_mlp": 0.01026749, "balance_loss_clip": 1.05020845, "balance_loss_mlp": 1.02028513, "epoch": 0.7497144231347321, "flos": 22492469393280.0, "grad_norm": 3.1885884685393346, "language_loss": 0.83864295, "learning_rate": 6.217565777484701e-07, "loss": 0.86041582, "num_input_tokens_seen": 134046680, "step": 6235, "time_per_iteration": 2.4919426441192627 }, { "auxiliary_loss_clip": 0.01137028, "auxiliary_loss_mlp": 0.0076013, "balance_loss_clip": 1.04636633, "balance_loss_mlp": 1.00022376, "epoch": 0.7498346660253713, "flos": 24243509502720.0, "grad_norm": 1.771204028295865, "language_loss": 0.80247426, "learning_rate": 6.211922039135722e-07, "loss": 0.8214457, "num_input_tokens_seen": 134066825, "step": 6236, "time_per_iteration": 3.285532236099243 }, { "auxiliary_loss_clip": 0.01166463, "auxiliary_loss_mlp": 0.01025736, "balance_loss_clip": 1.04947138, "balance_loss_mlp": 1.01903915, "epoch": 0.7499549089160104, "flos": 24387080163840.0, "grad_norm": 2.1068868092801103, "language_loss": 0.80884236, "learning_rate": 6.206280392427201e-07, "loss": 0.83076435, "num_input_tokens_seen": 134086410, "step": 6237, "time_per_iteration": 2.4761288166046143 }, { "auxiliary_loss_clip": 0.01143369, "auxiliary_loss_mlp": 0.01031811, "balance_loss_clip": 1.04351485, "balance_loss_mlp": 1.02446759, "epoch": 0.7500751518066494, "flos": 34057320704640.0, "grad_norm": 1.5091953904784017, "language_loss": 0.73846662, "learning_rate": 6.200640838214983e-07, "loss": 0.76021838, "num_input_tokens_seen": 134109185, "step": 6238, "time_per_iteration": 2.601134777069092 }, { "auxiliary_loss_clip": 0.01163226, "auxiliary_loss_mlp": 0.0102957, "balance_loss_clip": 1.04758584, "balance_loss_mlp": 1.02310848, "epoch": 0.7501953946972886, "flos": 18843586289280.0, "grad_norm": 1.9389816835684843, "language_loss": 0.67235982, "learning_rate": 6.195003377354578e-07, "loss": 0.69428772, "num_input_tokens_seen": 134128455, "step": 6239, "time_per_iteration": 2.4516124725341797 }, { "auxiliary_loss_clip": 0.0114817, "auxiliary_loss_mlp": 0.0102854, "balance_loss_clip": 1.04513168, "balance_loss_mlp": 1.02085066, "epoch": 0.7503156375879276, "flos": 20257675891200.0, "grad_norm": 2.638826809105794, "language_loss": 0.7319572, "learning_rate": 6.189368010701183e-07, "loss": 0.75372434, "num_input_tokens_seen": 134145515, "step": 6240, "time_per_iteration": 2.499545097351074 }, { "auxiliary_loss_clip": 0.01154288, "auxiliary_loss_mlp": 0.01021391, "balance_loss_clip": 1.04437017, "balance_loss_mlp": 1.01459289, "epoch": 0.7504358804785667, "flos": 13480040574720.0, "grad_norm": 1.8034852588708608, "language_loss": 0.76735997, "learning_rate": 6.183734739109683e-07, "loss": 0.78911674, "num_input_tokens_seen": 134163335, "step": 6241, "time_per_iteration": 2.467649459838867 }, { "auxiliary_loss_clip": 0.01156224, "auxiliary_loss_mlp": 0.01029195, "balance_loss_clip": 1.04915023, "balance_loss_mlp": 1.0214107, "epoch": 0.7505561233692057, "flos": 29461042431360.0, "grad_norm": 1.9215701819878643, "language_loss": 0.68897903, "learning_rate": 6.178103563434629e-07, "loss": 0.71083319, "num_input_tokens_seen": 134182335, "step": 6242, "time_per_iteration": 2.5358595848083496 }, { "auxiliary_loss_clip": 0.0116552, "auxiliary_loss_mlp": 0.01025141, "balance_loss_clip": 1.04878569, "balance_loss_mlp": 1.01825666, "epoch": 0.7506763662598449, "flos": 20302457172480.0, "grad_norm": 1.7171033807334242, "language_loss": 0.83774853, "learning_rate": 6.172474484530283e-07, "loss": 0.85965514, "num_input_tokens_seen": 134201070, "step": 6243, "time_per_iteration": 2.47774076461792 }, { "auxiliary_loss_clip": 0.01125085, "auxiliary_loss_mlp": 0.01026248, "balance_loss_clip": 1.04063988, "balance_loss_mlp": 1.01869583, "epoch": 0.750796609150484, "flos": 37230961939200.0, "grad_norm": 1.602233524873449, "language_loss": 0.75927943, "learning_rate": 6.166847503250563e-07, "loss": 0.78079277, "num_input_tokens_seen": 134223310, "step": 6244, "time_per_iteration": 2.6686129570007324 }, { "auxiliary_loss_clip": 0.01138057, "auxiliary_loss_mlp": 0.01027409, "balance_loss_clip": 1.04449058, "balance_loss_mlp": 1.02070689, "epoch": 0.750916852041123, "flos": 19609417186560.0, "grad_norm": 2.2627598009922743, "language_loss": 0.79118818, "learning_rate": 6.161222620449078e-07, "loss": 0.81284285, "num_input_tokens_seen": 134242085, "step": 6245, "time_per_iteration": 2.528900146484375 }, { "auxiliary_loss_clip": 0.01127078, "auxiliary_loss_mlp": 0.01030012, "balance_loss_clip": 1.04624796, "balance_loss_mlp": 1.02321458, "epoch": 0.7510370949317622, "flos": 25112690807040.0, "grad_norm": 2.2601726393376462, "language_loss": 0.80108911, "learning_rate": 6.155599836979117e-07, "loss": 0.82266003, "num_input_tokens_seen": 134260770, "step": 6246, "time_per_iteration": 2.597461223602295 }, { "auxiliary_loss_clip": 0.01111689, "auxiliary_loss_mlp": 0.01026548, "balance_loss_clip": 1.04207587, "balance_loss_mlp": 1.01945829, "epoch": 0.7511573378224012, "flos": 19062282245760.0, "grad_norm": 1.9058818973385154, "language_loss": 0.8140046, "learning_rate": 6.149979153693649e-07, "loss": 0.83538699, "num_input_tokens_seen": 134278025, "step": 6247, "time_per_iteration": 2.5822956562042236 }, { "auxiliary_loss_clip": 0.01146268, "auxiliary_loss_mlp": 0.01026444, "balance_loss_clip": 1.04459655, "balance_loss_mlp": 1.01924956, "epoch": 0.7512775807130403, "flos": 19937676602880.0, "grad_norm": 1.842419003075821, "language_loss": 0.76897597, "learning_rate": 6.144360571445343e-07, "loss": 0.79070312, "num_input_tokens_seen": 134297170, "step": 6248, "time_per_iteration": 3.279876232147217 }, { "auxiliary_loss_clip": 0.01148575, "auxiliary_loss_mlp": 0.01024347, "balance_loss_clip": 1.04829168, "balance_loss_mlp": 1.01716208, "epoch": 0.7513978236036795, "flos": 20739920912640.0, "grad_norm": 1.6579890181522157, "language_loss": 0.79871202, "learning_rate": 6.138744091086509e-07, "loss": 0.82044125, "num_input_tokens_seen": 134316755, "step": 6249, "time_per_iteration": 4.070315361022949 }, { "auxiliary_loss_clip": 0.01125105, "auxiliary_loss_mlp": 0.01025544, "balance_loss_clip": 1.04602599, "balance_loss_mlp": 1.01896119, "epoch": 0.7515180664943185, "flos": 27563163523200.0, "grad_norm": 3.1489178011242736, "language_loss": 0.72777104, "learning_rate": 6.133129713469183e-07, "loss": 0.74927759, "num_input_tokens_seen": 134335960, "step": 6250, "time_per_iteration": 2.6243326663970947 }, { "auxiliary_loss_clip": 0.01132516, "auxiliary_loss_mlp": 0.01023806, "balance_loss_clip": 1.04316998, "balance_loss_mlp": 1.01701427, "epoch": 0.7516383093849576, "flos": 33803181002880.0, "grad_norm": 1.8011596404267367, "language_loss": 0.6419872, "learning_rate": 6.127517439445053e-07, "loss": 0.66355044, "num_input_tokens_seen": 134356805, "step": 6251, "time_per_iteration": 2.697223663330078 }, { "auxiliary_loss_clip": 0.01100254, "auxiliary_loss_mlp": 0.01028183, "balance_loss_clip": 1.04193938, "balance_loss_mlp": 1.02183175, "epoch": 0.7517585522755967, "flos": 29746172592000.0, "grad_norm": 1.9460801815966915, "language_loss": 0.81582463, "learning_rate": 6.121907269865498e-07, "loss": 0.83710903, "num_input_tokens_seen": 134376295, "step": 6252, "time_per_iteration": 2.64725923538208 }, { "auxiliary_loss_clip": 0.01023803, "auxiliary_loss_mlp": 0.01000779, "balance_loss_clip": 1.00918746, "balance_loss_mlp": 0.99981368, "epoch": 0.7518787951662358, "flos": 69807974319360.0, "grad_norm": 0.9244843124448847, "language_loss": 0.67310929, "learning_rate": 6.116299205581577e-07, "loss": 0.6933552, "num_input_tokens_seen": 134431125, "step": 6253, "time_per_iteration": 3.1114208698272705 }, { "auxiliary_loss_clip": 0.01170699, "auxiliary_loss_mlp": 0.01026656, "balance_loss_clip": 1.05055535, "balance_loss_mlp": 1.01883006, "epoch": 0.7519990380568748, "flos": 34203225749760.0, "grad_norm": 2.126519314132762, "language_loss": 0.68759048, "learning_rate": 6.110693247444018e-07, "loss": 0.70956409, "num_input_tokens_seen": 134452960, "step": 6254, "time_per_iteration": 2.6071577072143555 }, { "auxiliary_loss_clip": 0.01111513, "auxiliary_loss_mlp": 0.01022226, "balance_loss_clip": 1.04238677, "balance_loss_mlp": 1.01600647, "epoch": 0.752119280947514, "flos": 21725704742400.0, "grad_norm": 1.7800000375566, "language_loss": 0.82469749, "learning_rate": 6.105089396303258e-07, "loss": 0.84603488, "num_input_tokens_seen": 134471350, "step": 6255, "time_per_iteration": 2.553864002227783 }, { "auxiliary_loss_clip": 0.0113732, "auxiliary_loss_mlp": 0.01029137, "balance_loss_clip": 1.04374504, "balance_loss_mlp": 1.02211547, "epoch": 0.7522395238381531, "flos": 32742774668160.0, "grad_norm": 1.8285768815249757, "language_loss": 0.75537407, "learning_rate": 6.099487653009383e-07, "loss": 0.77703863, "num_input_tokens_seen": 134490695, "step": 6256, "time_per_iteration": 2.6359426975250244 }, { "auxiliary_loss_clip": 0.01148764, "auxiliary_loss_mlp": 0.01025441, "balance_loss_clip": 1.04555893, "balance_loss_mlp": 1.01880443, "epoch": 0.7523597667287921, "flos": 23476026579840.0, "grad_norm": 1.9740247538150926, "language_loss": 0.83276796, "learning_rate": 6.093888018412192e-07, "loss": 0.85451007, "num_input_tokens_seen": 134506885, "step": 6257, "time_per_iteration": 2.5164895057678223 }, { "auxiliary_loss_clip": 0.01048034, "auxiliary_loss_mlp": 0.01001202, "balance_loss_clip": 1.00867832, "balance_loss_mlp": 1.00009966, "epoch": 0.7524800096194313, "flos": 67346730501120.0, "grad_norm": 0.7095707323809946, "language_loss": 0.54666471, "learning_rate": 6.088290493361125e-07, "loss": 0.56715715, "num_input_tokens_seen": 134571770, "step": 6258, "time_per_iteration": 3.249915599822998 }, { "auxiliary_loss_clip": 0.01098488, "auxiliary_loss_mlp": 0.01028599, "balance_loss_clip": 1.04110265, "balance_loss_mlp": 1.02133632, "epoch": 0.7526002525100703, "flos": 13006055681280.0, "grad_norm": 2.0360812768208953, "language_loss": 0.71430731, "learning_rate": 6.082695078705322e-07, "loss": 0.73557818, "num_input_tokens_seen": 134589250, "step": 6259, "time_per_iteration": 2.5667903423309326 }, { "auxiliary_loss_clip": 0.01145227, "auxiliary_loss_mlp": 0.01028798, "balance_loss_clip": 1.04615259, "balance_loss_mlp": 1.0215323, "epoch": 0.7527204954007094, "flos": 21397229844480.0, "grad_norm": 2.1002009197052955, "language_loss": 0.68446505, "learning_rate": 6.077101775293618e-07, "loss": 0.70620525, "num_input_tokens_seen": 134608075, "step": 6260, "time_per_iteration": 2.511019706726074 }, { "auxiliary_loss_clip": 0.01151298, "auxiliary_loss_mlp": 0.01025245, "balance_loss_clip": 1.04554451, "balance_loss_mlp": 1.01756787, "epoch": 0.7528407382913486, "flos": 18947188091520.0, "grad_norm": 2.3173733357821527, "language_loss": 0.82673168, "learning_rate": 6.071510583974504e-07, "loss": 0.84849709, "num_input_tokens_seen": 134623260, "step": 6261, "time_per_iteration": 2.4617974758148193 }, { "auxiliary_loss_clip": 0.01166716, "auxiliary_loss_mlp": 0.01030779, "balance_loss_clip": 1.04835665, "balance_loss_mlp": 1.02344429, "epoch": 0.7529609811819876, "flos": 15231798956160.0, "grad_norm": 1.8979817014794291, "language_loss": 0.7191686, "learning_rate": 6.065921505596161e-07, "loss": 0.74114358, "num_input_tokens_seen": 134641540, "step": 6262, "time_per_iteration": 3.129119634628296 }, { "auxiliary_loss_clip": 0.01121149, "auxiliary_loss_mlp": 0.01024487, "balance_loss_clip": 1.04439378, "balance_loss_mlp": 1.01780558, "epoch": 0.7530812240726267, "flos": 19354487385600.0, "grad_norm": 1.7617995249140659, "language_loss": 0.76778251, "learning_rate": 6.060334541006445e-07, "loss": 0.78923887, "num_input_tokens_seen": 134660035, "step": 6263, "time_per_iteration": 2.5495619773864746 }, { "auxiliary_loss_clip": 0.01123816, "auxiliary_loss_mlp": 0.01029191, "balance_loss_clip": 1.03987014, "balance_loss_mlp": 1.02245557, "epoch": 0.7532014669632658, "flos": 27748247328000.0, "grad_norm": 1.6351522340215003, "language_loss": 0.68991965, "learning_rate": 6.05474969105289e-07, "loss": 0.71144974, "num_input_tokens_seen": 134683025, "step": 6264, "time_per_iteration": 2.6634681224823 }, { "auxiliary_loss_clip": 0.01151708, "auxiliary_loss_mlp": 0.01024727, "balance_loss_clip": 1.04756951, "balance_loss_mlp": 1.01759219, "epoch": 0.7533217098539049, "flos": 14137421333760.0, "grad_norm": 2.033676205793127, "language_loss": 0.73727804, "learning_rate": 6.049166956582725e-07, "loss": 0.75904238, "num_input_tokens_seen": 134701290, "step": 6265, "time_per_iteration": 2.492546558380127 }, { "auxiliary_loss_clip": 0.0114851, "auxiliary_loss_mlp": 0.01024319, "balance_loss_clip": 1.04667783, "balance_loss_mlp": 1.01783097, "epoch": 0.753441952744544, "flos": 26429068437120.0, "grad_norm": 3.5740809725565783, "language_loss": 0.87490135, "learning_rate": 6.043586338442841e-07, "loss": 0.89662975, "num_input_tokens_seen": 134720345, "step": 6266, "time_per_iteration": 2.528953790664673 }, { "auxiliary_loss_clip": 0.01163866, "auxiliary_loss_mlp": 0.01024442, "balance_loss_clip": 1.04995918, "balance_loss_mlp": 1.01829064, "epoch": 0.7535621956351831, "flos": 23878621192320.0, "grad_norm": 1.7504916463998936, "language_loss": 0.72972959, "learning_rate": 6.038007837479815e-07, "loss": 0.75161266, "num_input_tokens_seen": 134741450, "step": 6267, "time_per_iteration": 2.496040105819702 }, { "auxiliary_loss_clip": 0.01149559, "auxiliary_loss_mlp": 0.01023252, "balance_loss_clip": 1.04767919, "balance_loss_mlp": 1.01642752, "epoch": 0.7536824385258222, "flos": 21795873960960.0, "grad_norm": 1.879179788386603, "language_loss": 0.64003956, "learning_rate": 6.032431454539897e-07, "loss": 0.66176772, "num_input_tokens_seen": 134760295, "step": 6268, "time_per_iteration": 2.505577564239502 }, { "auxiliary_loss_clip": 0.01124689, "auxiliary_loss_mlp": 0.01025629, "balance_loss_clip": 1.04388583, "balance_loss_mlp": 1.01900733, "epoch": 0.7538026814164612, "flos": 28911644933760.0, "grad_norm": 1.7480969815319523, "language_loss": 0.81738132, "learning_rate": 6.026857190469014e-07, "loss": 0.83888447, "num_input_tokens_seen": 134782050, "step": 6269, "time_per_iteration": 2.6259500980377197 }, { "auxiliary_loss_clip": 0.01138607, "auxiliary_loss_mlp": 0.01024107, "balance_loss_clip": 1.04597068, "balance_loss_mlp": 1.01667178, "epoch": 0.7539229243071004, "flos": 21104701482240.0, "grad_norm": 1.8454125871671823, "language_loss": 0.74463296, "learning_rate": 6.0212850461128e-07, "loss": 0.76626015, "num_input_tokens_seen": 134801170, "step": 6270, "time_per_iteration": 2.5406906604766846 }, { "auxiliary_loss_clip": 0.01138766, "auxiliary_loss_mlp": 0.01023863, "balance_loss_clip": 1.04377365, "balance_loss_mlp": 1.01648974, "epoch": 0.7540431671977395, "flos": 15158469340800.0, "grad_norm": 3.1456981106703363, "language_loss": 0.74921876, "learning_rate": 6.015715022316516e-07, "loss": 0.77084506, "num_input_tokens_seen": 134819150, "step": 6271, "time_per_iteration": 2.524465799331665 }, { "auxiliary_loss_clip": 0.01108246, "auxiliary_loss_mlp": 0.01020818, "balance_loss_clip": 1.04115987, "balance_loss_mlp": 1.01390946, "epoch": 0.7541634100883785, "flos": 18770579896320.0, "grad_norm": 2.54667103058873, "language_loss": 0.77206051, "learning_rate": 6.010147119925154e-07, "loss": 0.79335117, "num_input_tokens_seen": 134836905, "step": 6272, "time_per_iteration": 2.5998222827911377 }, { "auxiliary_loss_clip": 0.0111202, "auxiliary_loss_mlp": 0.01027199, "balance_loss_clip": 1.0422157, "balance_loss_mlp": 1.02020085, "epoch": 0.7542836529790176, "flos": 20594770053120.0, "grad_norm": 1.8524596076190376, "language_loss": 0.65988791, "learning_rate": 6.004581339783348e-07, "loss": 0.68128014, "num_input_tokens_seen": 134855225, "step": 6273, "time_per_iteration": 3.36283278465271 }, { "auxiliary_loss_clip": 0.01156403, "auxiliary_loss_mlp": 0.01032928, "balance_loss_clip": 1.04817319, "balance_loss_mlp": 1.02449954, "epoch": 0.7544038958696567, "flos": 19095104298240.0, "grad_norm": 4.1297571692097845, "language_loss": 0.68691266, "learning_rate": 5.999017682735425e-07, "loss": 0.70880598, "num_input_tokens_seen": 134871615, "step": 6274, "time_per_iteration": 2.4745991230010986 }, { "auxiliary_loss_clip": 0.01102501, "auxiliary_loss_mlp": 0.01027725, "balance_loss_clip": 1.04162598, "balance_loss_mlp": 1.02046561, "epoch": 0.7545241387602958, "flos": 31723306859520.0, "grad_norm": 1.898925435082557, "language_loss": 0.66423893, "learning_rate": 5.993456149625387e-07, "loss": 0.68554127, "num_input_tokens_seen": 134892765, "step": 6275, "time_per_iteration": 4.276025295257568 }, { "auxiliary_loss_clip": 0.01114641, "auxiliary_loss_mlp": 0.01028079, "balance_loss_clip": 1.04306865, "balance_loss_mlp": 1.02129936, "epoch": 0.7546443816509348, "flos": 20296495514880.0, "grad_norm": 1.734874114649532, "language_loss": 0.82523203, "learning_rate": 5.987896741296909e-07, "loss": 0.84665924, "num_input_tokens_seen": 134910505, "step": 6276, "time_per_iteration": 2.572920799255371 }, { "auxiliary_loss_clip": 0.01138407, "auxiliary_loss_mlp": 0.01026371, "balance_loss_clip": 1.04792833, "balance_loss_mlp": 1.01945651, "epoch": 0.754764624541574, "flos": 23696159080320.0, "grad_norm": 2.0298378408724207, "language_loss": 0.77934515, "learning_rate": 5.982339458593361e-07, "loss": 0.80099297, "num_input_tokens_seen": 134930445, "step": 6277, "time_per_iteration": 2.554483652114868 }, { "auxiliary_loss_clip": 0.01147542, "auxiliary_loss_mlp": 0.00760305, "balance_loss_clip": 1.04771543, "balance_loss_mlp": 1.00026536, "epoch": 0.7548848674322131, "flos": 25337204766720.0, "grad_norm": 1.6260373328700288, "language_loss": 0.84002388, "learning_rate": 5.976784302357767e-07, "loss": 0.85910237, "num_input_tokens_seen": 134951010, "step": 6278, "time_per_iteration": 2.559769630432129 }, { "auxiliary_loss_clip": 0.01154288, "auxiliary_loss_mlp": 0.01027358, "balance_loss_clip": 1.04864573, "balance_loss_mlp": 1.02060437, "epoch": 0.7550051103228521, "flos": 19573147428480.0, "grad_norm": 1.7530932220885231, "language_loss": 0.73919082, "learning_rate": 5.971231273432855e-07, "loss": 0.76100731, "num_input_tokens_seen": 134970495, "step": 6279, "time_per_iteration": 2.489847183227539 }, { "auxiliary_loss_clip": 0.01049335, "auxiliary_loss_mlp": 0.01002518, "balance_loss_clip": 1.01093113, "balance_loss_mlp": 1.00140309, "epoch": 0.7551253532134913, "flos": 64150068648960.0, "grad_norm": 0.8178661607288347, "language_loss": 0.54593796, "learning_rate": 5.965680372661e-07, "loss": 0.56645644, "num_input_tokens_seen": 135028060, "step": 6280, "time_per_iteration": 3.0285227298736572 }, { "auxiliary_loss_clip": 0.01138612, "auxiliary_loss_mlp": 0.01019769, "balance_loss_clip": 1.04817033, "balance_loss_mlp": 1.01345956, "epoch": 0.7552455961041303, "flos": 26067986968320.0, "grad_norm": 1.7293891092475546, "language_loss": 0.56356275, "learning_rate": 5.960131600884266e-07, "loss": 0.58514655, "num_input_tokens_seen": 135047330, "step": 6281, "time_per_iteration": 2.5642974376678467 }, { "auxiliary_loss_clip": 0.0112426, "auxiliary_loss_mlp": 0.01022194, "balance_loss_clip": 1.04350984, "balance_loss_mlp": 1.01607251, "epoch": 0.7553658389947694, "flos": 24498223822080.0, "grad_norm": 1.6271385526730684, "language_loss": 0.76204622, "learning_rate": 5.954584958944413e-07, "loss": 0.78351074, "num_input_tokens_seen": 135065995, "step": 6282, "time_per_iteration": 2.6135056018829346 }, { "auxiliary_loss_clip": 0.01124877, "auxiliary_loss_mlp": 0.00760481, "balance_loss_clip": 1.0423578, "balance_loss_mlp": 1.00022769, "epoch": 0.7554860818854086, "flos": 21799465320960.0, "grad_norm": 1.9333301126501194, "language_loss": 0.81627566, "learning_rate": 5.949040447682854e-07, "loss": 0.83512926, "num_input_tokens_seen": 135085820, "step": 6283, "time_per_iteration": 2.598297595977783 }, { "auxiliary_loss_clip": 0.01142553, "auxiliary_loss_mlp": 0.01024692, "balance_loss_clip": 1.04554546, "balance_loss_mlp": 1.01758111, "epoch": 0.7556063247760476, "flos": 16362123114240.0, "grad_norm": 2.6332056672184128, "language_loss": 0.6866976, "learning_rate": 5.943498067940686e-07, "loss": 0.70837009, "num_input_tokens_seen": 135102845, "step": 6284, "time_per_iteration": 2.5193393230438232 }, { "auxiliary_loss_clip": 0.01128036, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.04693544, "balance_loss_mlp": 1.02374232, "epoch": 0.7557265676666867, "flos": 27235155502080.0, "grad_norm": 1.794549547219913, "language_loss": 0.81447613, "learning_rate": 5.937957820558686e-07, "loss": 0.836061, "num_input_tokens_seen": 135122190, "step": 6285, "time_per_iteration": 2.567260265350342 }, { "auxiliary_loss_clip": 0.01033864, "auxiliary_loss_mlp": 0.01000616, "balance_loss_clip": 1.00808001, "balance_loss_mlp": 0.99950695, "epoch": 0.7558468105573258, "flos": 62189131415040.0, "grad_norm": 0.8472660789058853, "language_loss": 0.65450746, "learning_rate": 5.932419706377296e-07, "loss": 0.67485225, "num_input_tokens_seen": 135180495, "step": 6286, "time_per_iteration": 3.076275587081909 }, { "auxiliary_loss_clip": 0.01120809, "auxiliary_loss_mlp": 0.01027472, "balance_loss_clip": 1.04768693, "balance_loss_mlp": 1.02019691, "epoch": 0.7559670534479649, "flos": 33249078823680.0, "grad_norm": 1.9585203073361095, "language_loss": 0.74198288, "learning_rate": 5.92688372623666e-07, "loss": 0.76346564, "num_input_tokens_seen": 135199200, "step": 6287, "time_per_iteration": 3.4155259132385254 }, { "auxiliary_loss_clip": 0.0115019, "auxiliary_loss_mlp": 0.01023002, "balance_loss_clip": 1.04473758, "balance_loss_mlp": 1.01528311, "epoch": 0.7560872963386039, "flos": 14064379027200.0, "grad_norm": 1.9530731473118517, "language_loss": 0.74086165, "learning_rate": 5.921349880976574e-07, "loss": 0.76259363, "num_input_tokens_seen": 135217035, "step": 6288, "time_per_iteration": 2.473651170730591 }, { "auxiliary_loss_clip": 0.01138407, "auxiliary_loss_mlp": 0.00760438, "balance_loss_clip": 1.04275894, "balance_loss_mlp": 1.00024307, "epoch": 0.7562075392292431, "flos": 20412307941120.0, "grad_norm": 2.5936671517322982, "language_loss": 0.8199861, "learning_rate": 5.915818171436515e-07, "loss": 0.83897454, "num_input_tokens_seen": 135236370, "step": 6289, "time_per_iteration": 2.5446269512176514 }, { "auxiliary_loss_clip": 0.01133963, "auxiliary_loss_mlp": 0.01028733, "balance_loss_clip": 1.04004383, "balance_loss_mlp": 1.02143705, "epoch": 0.7563277821198822, "flos": 20376792368640.0, "grad_norm": 1.6621966362084493, "language_loss": 0.7503475, "learning_rate": 5.910288598455642e-07, "loss": 0.77197438, "num_input_tokens_seen": 135255720, "step": 6290, "time_per_iteration": 2.520799398422241 }, { "auxiliary_loss_clip": 0.01156047, "auxiliary_loss_mlp": 0.01025518, "balance_loss_clip": 1.04614234, "balance_loss_mlp": 1.01775742, "epoch": 0.7564480250105212, "flos": 18588261438720.0, "grad_norm": 2.2020485704627792, "language_loss": 0.74840701, "learning_rate": 5.90476116287278e-07, "loss": 0.77022266, "num_input_tokens_seen": 135273320, "step": 6291, "time_per_iteration": 2.4727063179016113 }, { "auxiliary_loss_clip": 0.0113926, "auxiliary_loss_mlp": 0.01026294, "balance_loss_clip": 1.04804504, "balance_loss_mlp": 1.01927876, "epoch": 0.7565682679011604, "flos": 21215521918080.0, "grad_norm": 1.8818925405281577, "language_loss": 0.68192637, "learning_rate": 5.899235865526456e-07, "loss": 0.70358193, "num_input_tokens_seen": 135292615, "step": 6292, "time_per_iteration": 2.5118982791900635 }, { "auxiliary_loss_clip": 0.01115276, "auxiliary_loss_mlp": 0.01022857, "balance_loss_clip": 1.04299402, "balance_loss_mlp": 1.01616025, "epoch": 0.7566885107917994, "flos": 20449008662400.0, "grad_norm": 1.6548226669149304, "language_loss": 0.82116491, "learning_rate": 5.893712707254825e-07, "loss": 0.84254622, "num_input_tokens_seen": 135310075, "step": 6293, "time_per_iteration": 2.5548954010009766 }, { "auxiliary_loss_clip": 0.0110315, "auxiliary_loss_mlp": 0.01026026, "balance_loss_clip": 1.0386796, "balance_loss_mlp": 1.01920748, "epoch": 0.7568087536824385, "flos": 19025832919680.0, "grad_norm": 2.8261807469211075, "language_loss": 0.66233253, "learning_rate": 5.888191688895769e-07, "loss": 0.68362433, "num_input_tokens_seen": 135327335, "step": 6294, "time_per_iteration": 2.5610146522521973 }, { "auxiliary_loss_clip": 0.01163521, "auxiliary_loss_mlp": 0.01021311, "balance_loss_clip": 1.04661083, "balance_loss_mlp": 1.01377964, "epoch": 0.7569289965730777, "flos": 15225442248960.0, "grad_norm": 1.962535535504141, "language_loss": 0.61988747, "learning_rate": 5.882672811286813e-07, "loss": 0.64173579, "num_input_tokens_seen": 135343615, "step": 6295, "time_per_iteration": 2.4576311111450195 }, { "auxiliary_loss_clip": 0.01165076, "auxiliary_loss_mlp": 0.01026138, "balance_loss_clip": 1.04708529, "balance_loss_mlp": 1.01926279, "epoch": 0.7570492394637167, "flos": 20769367086720.0, "grad_norm": 2.104774822748368, "language_loss": 0.69502455, "learning_rate": 5.877156075265166e-07, "loss": 0.71693665, "num_input_tokens_seen": 135359880, "step": 6296, "time_per_iteration": 2.4464166164398193 }, { "auxiliary_loss_clip": 0.01133816, "auxiliary_loss_mlp": 0.01024606, "balance_loss_clip": 1.04246712, "balance_loss_mlp": 1.01738501, "epoch": 0.7571694823543558, "flos": 15664091137920.0, "grad_norm": 2.508183112578675, "language_loss": 0.69937485, "learning_rate": 5.871641481667715e-07, "loss": 0.72095907, "num_input_tokens_seen": 135374325, "step": 6297, "time_per_iteration": 2.4712486267089844 }, { "auxiliary_loss_clip": 0.01113823, "auxiliary_loss_mlp": 0.01028131, "balance_loss_clip": 1.04342628, "balance_loss_mlp": 1.02138412, "epoch": 0.7572897252449949, "flos": 25409241492480.0, "grad_norm": 1.632630195196061, "language_loss": 0.84287155, "learning_rate": 5.866129031331011e-07, "loss": 0.86429107, "num_input_tokens_seen": 135393980, "step": 6298, "time_per_iteration": 2.606428384780884 }, { "auxiliary_loss_clip": 0.01139191, "auxiliary_loss_mlp": 0.010253, "balance_loss_clip": 1.04476643, "balance_loss_mlp": 1.01835346, "epoch": 0.757409968135634, "flos": 24279348297600.0, "grad_norm": 2.1545124764189643, "language_loss": 0.83329141, "learning_rate": 5.8606187250913e-07, "loss": 0.85493636, "num_input_tokens_seen": 135412030, "step": 6299, "time_per_iteration": 3.309645891189575 }, { "auxiliary_loss_clip": 0.01150203, "auxiliary_loss_mlp": 0.00760916, "balance_loss_clip": 1.04874969, "balance_loss_mlp": 1.00027537, "epoch": 0.757530211026273, "flos": 24133766474880.0, "grad_norm": 1.7278152983801374, "language_loss": 0.83810246, "learning_rate": 5.855110563784482e-07, "loss": 0.85721362, "num_input_tokens_seen": 135430565, "step": 6300, "time_per_iteration": 3.354339122772217 }, { "auxiliary_loss_clip": 0.01145205, "auxiliary_loss_mlp": 0.00760321, "balance_loss_clip": 1.044981, "balance_loss_mlp": 1.00023484, "epoch": 0.7576504539169122, "flos": 23951807153280.0, "grad_norm": 1.589729935439796, "language_loss": 0.6412127, "learning_rate": 5.849604548246156e-07, "loss": 0.66026807, "num_input_tokens_seen": 135451675, "step": 6301, "time_per_iteration": 3.2500948905944824 }, { "auxiliary_loss_clip": 0.01144256, "auxiliary_loss_mlp": 0.00760354, "balance_loss_clip": 1.04901731, "balance_loss_mlp": 1.00026512, "epoch": 0.7577706968075513, "flos": 21251360712960.0, "grad_norm": 2.8099834376029573, "language_loss": 0.80317807, "learning_rate": 5.844100679311565e-07, "loss": 0.82222414, "num_input_tokens_seen": 135470635, "step": 6302, "time_per_iteration": 2.5195209980010986 }, { "auxiliary_loss_clip": 0.01137837, "auxiliary_loss_mlp": 0.01026345, "balance_loss_clip": 1.04792738, "balance_loss_mlp": 1.01885259, "epoch": 0.7578909396981903, "flos": 18296595002880.0, "grad_norm": 1.9811602822511167, "language_loss": 0.76441038, "learning_rate": 5.838598957815637e-07, "loss": 0.78605223, "num_input_tokens_seen": 135487865, "step": 6303, "time_per_iteration": 2.485959053039551 }, { "auxiliary_loss_clip": 0.01131239, "auxiliary_loss_mlp": 0.01025293, "balance_loss_clip": 1.04603338, "balance_loss_mlp": 1.01806355, "epoch": 0.7580111825888295, "flos": 25373869574400.0, "grad_norm": 1.5234044348824602, "language_loss": 0.85359859, "learning_rate": 5.833099384592996e-07, "loss": 0.87516397, "num_input_tokens_seen": 135508440, "step": 6304, "time_per_iteration": 2.56187105178833 }, { "auxiliary_loss_clip": 0.01135168, "auxiliary_loss_mlp": 0.01023676, "balance_loss_clip": 1.04680026, "balance_loss_mlp": 1.01715231, "epoch": 0.7581314254794685, "flos": 23768662682880.0, "grad_norm": 2.4332544721032487, "language_loss": 0.71617883, "learning_rate": 5.827601960477913e-07, "loss": 0.73776728, "num_input_tokens_seen": 135526365, "step": 6305, "time_per_iteration": 2.515650987625122 }, { "auxiliary_loss_clip": 0.01147095, "auxiliary_loss_mlp": 0.01024441, "balance_loss_clip": 1.04476774, "balance_loss_mlp": 1.01767612, "epoch": 0.7582516683701076, "flos": 22054610603520.0, "grad_norm": 1.693623405581964, "language_loss": 0.70603848, "learning_rate": 5.822106686304344e-07, "loss": 0.72775388, "num_input_tokens_seen": 135545655, "step": 6306, "time_per_iteration": 2.5034329891204834 }, { "auxiliary_loss_clip": 0.01127908, "auxiliary_loss_mlp": 0.01028416, "balance_loss_clip": 1.04246116, "balance_loss_mlp": 1.02186823, "epoch": 0.7583719112607467, "flos": 31649725848960.0, "grad_norm": 1.592147231226397, "language_loss": 0.57802141, "learning_rate": 5.816613562905919e-07, "loss": 0.59958458, "num_input_tokens_seen": 135566840, "step": 6307, "time_per_iteration": 2.633861780166626 }, { "auxiliary_loss_clip": 0.01115066, "auxiliary_loss_mlp": 0.01023375, "balance_loss_clip": 1.04187584, "balance_loss_mlp": 1.0167979, "epoch": 0.7584921541513858, "flos": 33068376478080.0, "grad_norm": 1.5438072518692956, "language_loss": 0.69804072, "learning_rate": 5.811122591115933e-07, "loss": 0.7194252, "num_input_tokens_seen": 135587825, "step": 6308, "time_per_iteration": 2.6755828857421875 }, { "auxiliary_loss_clip": 0.01122856, "auxiliary_loss_mlp": 0.01030186, "balance_loss_clip": 1.04777932, "balance_loss_mlp": 1.02328658, "epoch": 0.7586123970420249, "flos": 23326350606720.0, "grad_norm": 2.5264297523187498, "language_loss": 0.71434706, "learning_rate": 5.805633771767376e-07, "loss": 0.73587751, "num_input_tokens_seen": 135605220, "step": 6309, "time_per_iteration": 2.5452022552490234 }, { "auxiliary_loss_clip": 0.01129412, "auxiliary_loss_mlp": 0.01025055, "balance_loss_clip": 1.0448513, "balance_loss_mlp": 1.01800096, "epoch": 0.7587326399326639, "flos": 18334229477760.0, "grad_norm": 1.6248246600704532, "language_loss": 0.77555394, "learning_rate": 5.800147105692888e-07, "loss": 0.79709864, "num_input_tokens_seen": 135624795, "step": 6310, "time_per_iteration": 2.545656681060791 }, { "auxiliary_loss_clip": 0.01149355, "auxiliary_loss_mlp": 0.01024289, "balance_loss_clip": 1.04283464, "balance_loss_mlp": 1.01746464, "epoch": 0.7588528828233031, "flos": 17275080119040.0, "grad_norm": 1.828367730074388, "language_loss": 0.79011232, "learning_rate": 5.794662593724795e-07, "loss": 0.81184876, "num_input_tokens_seen": 135643800, "step": 6311, "time_per_iteration": 2.4788119792938232 }, { "auxiliary_loss_clip": 0.01165493, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.04854155, "balance_loss_mlp": 1.02113903, "epoch": 0.7589731257139422, "flos": 17713621267200.0, "grad_norm": 1.8858461267105502, "language_loss": 0.74803972, "learning_rate": 5.789180236695091e-07, "loss": 0.76997912, "num_input_tokens_seen": 135660655, "step": 6312, "time_per_iteration": 2.450998306274414 }, { "auxiliary_loss_clip": 0.01145612, "auxiliary_loss_mlp": 0.01021866, "balance_loss_clip": 1.04615116, "balance_loss_mlp": 1.01557136, "epoch": 0.7590933686045812, "flos": 15961072786560.0, "grad_norm": 1.9491831132253836, "language_loss": 0.84868789, "learning_rate": 5.78370003543544e-07, "loss": 0.87036264, "num_input_tokens_seen": 135679410, "step": 6313, "time_per_iteration": 2.461963176727295 }, { "auxiliary_loss_clip": 0.01151503, "auxiliary_loss_mlp": 0.0076076, "balance_loss_clip": 1.04723406, "balance_loss_mlp": 1.00026166, "epoch": 0.7592136114952204, "flos": 21068072588160.0, "grad_norm": 1.9028909066499755, "language_loss": 0.8385517, "learning_rate": 5.778221990777203e-07, "loss": 0.85767436, "num_input_tokens_seen": 135697150, "step": 6314, "time_per_iteration": 3.25311541557312 }, { "auxiliary_loss_clip": 0.0113993, "auxiliary_loss_mlp": 0.0102681, "balance_loss_clip": 1.04780293, "balance_loss_mlp": 1.01988387, "epoch": 0.7593338543858594, "flos": 25297666871040.0, "grad_norm": 1.9740704353826688, "language_loss": 0.82804322, "learning_rate": 5.772746103551372e-07, "loss": 0.84971064, "num_input_tokens_seen": 135712545, "step": 6315, "time_per_iteration": 2.5311686992645264 }, { "auxiliary_loss_clip": 0.0113502, "auxiliary_loss_mlp": 0.01024409, "balance_loss_clip": 1.04699397, "balance_loss_mlp": 1.0178194, "epoch": 0.7594540972764985, "flos": 31832367528960.0, "grad_norm": 1.8355363712275725, "language_loss": 0.71621025, "learning_rate": 5.767272374588648e-07, "loss": 0.73780453, "num_input_tokens_seen": 135733950, "step": 6316, "time_per_iteration": 2.6082472801208496 }, { "auxiliary_loss_clip": 0.01148553, "auxiliary_loss_mlp": 0.01024153, "balance_loss_clip": 1.04734182, "balance_loss_mlp": 1.01753116, "epoch": 0.7595743401671377, "flos": 37597250880000.0, "grad_norm": 2.526937294712178, "language_loss": 0.77912152, "learning_rate": 5.76180080471939e-07, "loss": 0.8008486, "num_input_tokens_seen": 135757120, "step": 6317, "time_per_iteration": 2.638488292694092 }, { "auxiliary_loss_clip": 0.01167824, "auxiliary_loss_mlp": 0.01029037, "balance_loss_clip": 1.04903936, "balance_loss_mlp": 1.02145529, "epoch": 0.7596945830577767, "flos": 18287724343680.0, "grad_norm": 1.9725976441731259, "language_loss": 0.71721423, "learning_rate": 5.756331394773631e-07, "loss": 0.73918283, "num_input_tokens_seen": 135773335, "step": 6318, "time_per_iteration": 2.4152064323425293 }, { "auxiliary_loss_clip": 0.01093858, "auxiliary_loss_mlp": 0.00760429, "balance_loss_clip": 1.04124546, "balance_loss_mlp": 1.00022244, "epoch": 0.7598148259484158, "flos": 22233122219520.0, "grad_norm": 1.630465295060502, "language_loss": 0.75609148, "learning_rate": 5.750864145581071e-07, "loss": 0.77463436, "num_input_tokens_seen": 135792555, "step": 6319, "time_per_iteration": 2.6289210319519043 }, { "auxiliary_loss_clip": 0.0116485, "auxiliary_loss_mlp": 0.01031225, "balance_loss_clip": 1.0489186, "balance_loss_mlp": 1.0245254, "epoch": 0.7599350688390549, "flos": 27161718145920.0, "grad_norm": 1.8547230341334604, "language_loss": 0.86334121, "learning_rate": 5.745399057971085e-07, "loss": 0.88530195, "num_input_tokens_seen": 135813690, "step": 6320, "time_per_iteration": 2.516059637069702 }, { "auxiliary_loss_clip": 0.01153779, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.04607236, "balance_loss_mlp": 1.02174115, "epoch": 0.760055311729694, "flos": 15560704817280.0, "grad_norm": 2.046105799644153, "language_loss": 0.74953353, "learning_rate": 5.739936132772738e-07, "loss": 0.77136272, "num_input_tokens_seen": 135832255, "step": 6321, "time_per_iteration": 2.466679334640503 }, { "auxiliary_loss_clip": 0.01162807, "auxiliary_loss_mlp": 0.01027729, "balance_loss_clip": 1.04645646, "balance_loss_mlp": 1.01990259, "epoch": 0.760175554620333, "flos": 25155496840320.0, "grad_norm": 1.837311660323439, "language_loss": 0.74341613, "learning_rate": 5.734475370814733e-07, "loss": 0.76532149, "num_input_tokens_seen": 135851935, "step": 6322, "time_per_iteration": 2.4814727306365967 }, { "auxiliary_loss_clip": 0.01149471, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.04361558, "balance_loss_mlp": 1.01862383, "epoch": 0.7602957975109722, "flos": 24353791234560.0, "grad_norm": 1.5007692699439728, "language_loss": 0.78543818, "learning_rate": 5.729016772925483e-07, "loss": 0.80718863, "num_input_tokens_seen": 135873510, "step": 6323, "time_per_iteration": 2.5236868858337402 }, { "auxiliary_loss_clip": 0.01104179, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.04266405, "balance_loss_mlp": 1.02177191, "epoch": 0.7604160404016113, "flos": 25192664438400.0, "grad_norm": 1.7043596616014984, "language_loss": 0.70684195, "learning_rate": 5.723560339933038e-07, "loss": 0.72818422, "num_input_tokens_seen": 135893845, "step": 6324, "time_per_iteration": 2.6087496280670166 }, { "auxiliary_loss_clip": 0.01150247, "auxiliary_loss_mlp": 0.00760619, "balance_loss_clip": 1.04589593, "balance_loss_mlp": 1.00025296, "epoch": 0.7605362832922503, "flos": 29861841363840.0, "grad_norm": 2.193203560848772, "language_loss": 0.64761871, "learning_rate": 5.71810607266513e-07, "loss": 0.66672736, "num_input_tokens_seen": 135912430, "step": 6325, "time_per_iteration": 3.307513952255249 }, { "auxiliary_loss_clip": 0.0115149, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.04637814, "balance_loss_mlp": 1.02067232, "epoch": 0.7606565261828895, "flos": 13917935278080.0, "grad_norm": 1.810525429734524, "language_loss": 0.60192609, "learning_rate": 5.712653971949184e-07, "loss": 0.62371773, "num_input_tokens_seen": 135930550, "step": 6326, "time_per_iteration": 3.2753994464874268 }, { "auxiliary_loss_clip": 0.01145787, "auxiliary_loss_mlp": 0.01024434, "balance_loss_clip": 1.04491234, "balance_loss_mlp": 1.01679873, "epoch": 0.7607767690735285, "flos": 18551273408640.0, "grad_norm": 2.525304684920854, "language_loss": 0.75163108, "learning_rate": 5.707204038612268e-07, "loss": 0.77333331, "num_input_tokens_seen": 135947980, "step": 6327, "time_per_iteration": 3.2129251956939697 }, { "auxiliary_loss_clip": 0.01148692, "auxiliary_loss_mlp": 0.01033543, "balance_loss_clip": 1.05229187, "balance_loss_mlp": 1.02534127, "epoch": 0.7608970119641676, "flos": 20922993555840.0, "grad_norm": 2.737354611202029, "language_loss": 0.74038863, "learning_rate": 5.701756273481138e-07, "loss": 0.76221097, "num_input_tokens_seen": 135965400, "step": 6328, "time_per_iteration": 2.5178442001342773 }, { "auxiliary_loss_clip": 0.01135197, "auxiliary_loss_mlp": 0.01025799, "balance_loss_clip": 1.04357374, "balance_loss_mlp": 1.01859617, "epoch": 0.7610172548548068, "flos": 23807302738560.0, "grad_norm": 1.4728384510788137, "language_loss": 0.73859912, "learning_rate": 5.696310677382212e-07, "loss": 0.76020908, "num_input_tokens_seen": 135986795, "step": 6329, "time_per_iteration": 2.5391077995300293 }, { "auxiliary_loss_clip": 0.01023816, "auxiliary_loss_mlp": 0.01001389, "balance_loss_clip": 1.01029754, "balance_loss_mlp": 1.00031602, "epoch": 0.7611374977454458, "flos": 66496580426880.0, "grad_norm": 0.8852852776416007, "language_loss": 0.61770189, "learning_rate": 5.690867251141576e-07, "loss": 0.63795394, "num_input_tokens_seen": 136053450, "step": 6330, "time_per_iteration": 3.3051459789276123 }, { "auxiliary_loss_clip": 0.0115406, "auxiliary_loss_mlp": 0.01030604, "balance_loss_clip": 1.04642272, "balance_loss_mlp": 1.02287948, "epoch": 0.7612577406360849, "flos": 15633136592640.0, "grad_norm": 2.013891165056972, "language_loss": 0.91489697, "learning_rate": 5.685425995585013e-07, "loss": 0.93674356, "num_input_tokens_seen": 136071375, "step": 6331, "time_per_iteration": 2.5030243396759033 }, { "auxiliary_loss_clip": 0.01038565, "auxiliary_loss_mlp": 0.01004163, "balance_loss_clip": 1.00948453, "balance_loss_mlp": 1.00301254, "epoch": 0.761377983526724, "flos": 60526253237760.0, "grad_norm": 0.761871995392517, "language_loss": 0.59058797, "learning_rate": 5.679986911537935e-07, "loss": 0.6110152, "num_input_tokens_seen": 136138905, "step": 6332, "time_per_iteration": 3.2729945182800293 }, { "auxiliary_loss_clip": 0.01094266, "auxiliary_loss_mlp": 0.01028241, "balance_loss_clip": 1.03996325, "balance_loss_mlp": 1.02121639, "epoch": 0.7614982264173631, "flos": 35772522019200.0, "grad_norm": 1.7062527960777727, "language_loss": 0.67029548, "learning_rate": 5.674549999825462e-07, "loss": 0.69152051, "num_input_tokens_seen": 136161720, "step": 6333, "time_per_iteration": 2.6852099895477295 }, { "auxiliary_loss_clip": 0.01047657, "auxiliary_loss_mlp": 0.01000421, "balance_loss_clip": 1.00912833, "balance_loss_mlp": 0.99926436, "epoch": 0.7616184693080021, "flos": 67925502345600.0, "grad_norm": 0.9196568058139132, "language_loss": 0.71454853, "learning_rate": 5.669115261272363e-07, "loss": 0.73502934, "num_input_tokens_seen": 136222040, "step": 6334, "time_per_iteration": 3.0886387825012207 }, { "auxiliary_loss_clip": 0.01154618, "auxiliary_loss_mlp": 0.01028956, "balance_loss_clip": 1.04832792, "balance_loss_mlp": 1.02197337, "epoch": 0.7617387121986413, "flos": 20521979141760.0, "grad_norm": 2.22525732197616, "language_loss": 0.72213817, "learning_rate": 5.663682696703081e-07, "loss": 0.74397391, "num_input_tokens_seen": 136240305, "step": 6335, "time_per_iteration": 2.4654006958007812 }, { "auxiliary_loss_clip": 0.01163705, "auxiliary_loss_mlp": 0.0102863, "balance_loss_clip": 1.04815042, "balance_loss_mlp": 1.02183759, "epoch": 0.7618589550892804, "flos": 18624495283200.0, "grad_norm": 1.7134878725615768, "language_loss": 0.81967413, "learning_rate": 5.658252306941746e-07, "loss": 0.8415975, "num_input_tokens_seen": 136259625, "step": 6336, "time_per_iteration": 2.428879737854004 }, { "auxiliary_loss_clip": 0.01112562, "auxiliary_loss_mlp": 0.01024496, "balance_loss_clip": 1.04442656, "balance_loss_mlp": 1.01694393, "epoch": 0.7619791979799194, "flos": 17453735389440.0, "grad_norm": 2.211111597295149, "language_loss": 0.75053298, "learning_rate": 5.65282409281212e-07, "loss": 0.77190357, "num_input_tokens_seen": 136277090, "step": 6337, "time_per_iteration": 2.55572772026062 }, { "auxiliary_loss_clip": 0.01132741, "auxiliary_loss_mlp": 0.01031499, "balance_loss_clip": 1.04439962, "balance_loss_mlp": 1.02465641, "epoch": 0.7620994408705585, "flos": 14137421333760.0, "grad_norm": 2.4967145260602126, "language_loss": 0.69746745, "learning_rate": 5.64739805513768e-07, "loss": 0.71910977, "num_input_tokens_seen": 136294635, "step": 6338, "time_per_iteration": 2.503326654434204 }, { "auxiliary_loss_clip": 0.01045985, "auxiliary_loss_mlp": 0.00751267, "balance_loss_clip": 1.01040137, "balance_loss_mlp": 1.00007582, "epoch": 0.7622196837611976, "flos": 70708792527360.0, "grad_norm": 0.7880571559918096, "language_loss": 0.55737901, "learning_rate": 5.641974194741541e-07, "loss": 0.57535148, "num_input_tokens_seen": 136350320, "step": 6339, "time_per_iteration": 2.9996328353881836 }, { "auxiliary_loss_clip": 0.01027927, "auxiliary_loss_mlp": 0.01002374, "balance_loss_clip": 1.01209581, "balance_loss_mlp": 1.00121176, "epoch": 0.7623399266518367, "flos": 60684150447360.0, "grad_norm": 0.7751933052684012, "language_loss": 0.63717395, "learning_rate": 5.636552512446502e-07, "loss": 0.65747696, "num_input_tokens_seen": 136411375, "step": 6340, "time_per_iteration": 3.744497776031494 }, { "auxiliary_loss_clip": 0.01145351, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.04606199, "balance_loss_mlp": 1.02323604, "epoch": 0.7624601695424758, "flos": 26468893641600.0, "grad_norm": 1.672086263401076, "language_loss": 0.78005278, "learning_rate": 5.631133009075027e-07, "loss": 0.80181062, "num_input_tokens_seen": 136430560, "step": 6341, "time_per_iteration": 2.5267655849456787 }, { "auxiliary_loss_clip": 0.01150873, "auxiliary_loss_mlp": 0.00760188, "balance_loss_clip": 1.04708362, "balance_loss_mlp": 1.00023913, "epoch": 0.7625804124331149, "flos": 19135755515520.0, "grad_norm": 1.8070997533263713, "language_loss": 0.68233848, "learning_rate": 5.625715685449242e-07, "loss": 0.7014491, "num_input_tokens_seen": 136448665, "step": 6342, "time_per_iteration": 2.470076322555542 }, { "auxiliary_loss_clip": 0.01117611, "auxiliary_loss_mlp": 0.01030445, "balance_loss_clip": 1.04778266, "balance_loss_mlp": 1.02396631, "epoch": 0.762700655323754, "flos": 26213101914240.0, "grad_norm": 1.5351551921640936, "language_loss": 0.71685427, "learning_rate": 5.620300542390966e-07, "loss": 0.73833477, "num_input_tokens_seen": 136469710, "step": 6343, "time_per_iteration": 2.5825188159942627 }, { "auxiliary_loss_clip": 0.01130849, "auxiliary_loss_mlp": 0.01023019, "balance_loss_clip": 1.04455829, "balance_loss_mlp": 1.01618195, "epoch": 0.762820898214393, "flos": 22382582711040.0, "grad_norm": 1.8676049612498908, "language_loss": 0.85095334, "learning_rate": 5.614887580721659e-07, "loss": 0.87249207, "num_input_tokens_seen": 136489855, "step": 6344, "time_per_iteration": 2.525118827819824 }, { "auxiliary_loss_clip": 0.01119682, "auxiliary_loss_mlp": 0.01031538, "balance_loss_clip": 1.04714942, "balance_loss_mlp": 1.02476144, "epoch": 0.7629411411050322, "flos": 15700504550400.0, "grad_norm": 2.322787147383839, "language_loss": 0.73517859, "learning_rate": 5.609476801262481e-07, "loss": 0.75669074, "num_input_tokens_seen": 136504715, "step": 6345, "time_per_iteration": 2.5189685821533203 }, { "auxiliary_loss_clip": 0.01119906, "auxiliary_loss_mlp": 0.01025671, "balance_loss_clip": 1.04485798, "balance_loss_mlp": 1.0186646, "epoch": 0.7630613839956712, "flos": 13770342293760.0, "grad_norm": 2.1162797310752524, "language_loss": 0.64066082, "learning_rate": 5.604068204834223e-07, "loss": 0.66211659, "num_input_tokens_seen": 136521610, "step": 6346, "time_per_iteration": 2.4946060180664062 }, { "auxiliary_loss_clip": 0.01107, "auxiliary_loss_mlp": 0.00761224, "balance_loss_clip": 1.04369164, "balance_loss_mlp": 1.00022507, "epoch": 0.7631816268863103, "flos": 14569569861120.0, "grad_norm": 1.99258828786321, "language_loss": 0.76383305, "learning_rate": 5.598661792257367e-07, "loss": 0.78251529, "num_input_tokens_seen": 136538655, "step": 6347, "time_per_iteration": 2.5658271312713623 }, { "auxiliary_loss_clip": 0.01148331, "auxiliary_loss_mlp": 0.01025539, "balance_loss_clip": 1.04543245, "balance_loss_mlp": 1.01850557, "epoch": 0.7633018697769495, "flos": 19062210418560.0, "grad_norm": 1.8836035393487447, "language_loss": 0.75830019, "learning_rate": 5.593257564352071e-07, "loss": 0.78003883, "num_input_tokens_seen": 136557095, "step": 6348, "time_per_iteration": 2.4694929122924805 }, { "auxiliary_loss_clip": 0.01148368, "auxiliary_loss_mlp": 0.01028685, "balance_loss_clip": 1.04680407, "balance_loss_mlp": 1.02203321, "epoch": 0.7634221126675885, "flos": 22052958577920.0, "grad_norm": 1.8500280001909801, "language_loss": 0.75650716, "learning_rate": 5.58785552193815e-07, "loss": 0.77827764, "num_input_tokens_seen": 136577340, "step": 6349, "time_per_iteration": 2.4983203411102295 }, { "auxiliary_loss_clip": 0.01162843, "auxiliary_loss_mlp": 0.0102377, "balance_loss_clip": 1.04646146, "balance_loss_mlp": 1.01725793, "epoch": 0.7635423555582276, "flos": 29382720825600.0, "grad_norm": 1.7762796062646888, "language_loss": 0.75394315, "learning_rate": 5.582455665835086e-07, "loss": 0.77580929, "num_input_tokens_seen": 136597635, "step": 6350, "time_per_iteration": 2.498189687728882 }, { "auxiliary_loss_clip": 0.01143102, "auxiliary_loss_mlp": 0.01029628, "balance_loss_clip": 1.04329622, "balance_loss_mlp": 1.02202868, "epoch": 0.7636625984488667, "flos": 17784903807360.0, "grad_norm": 3.0252774209669013, "language_loss": 0.7263214, "learning_rate": 5.577057996862036e-07, "loss": 0.74804866, "num_input_tokens_seen": 136615260, "step": 6351, "time_per_iteration": 3.277927875518799 }, { "auxiliary_loss_clip": 0.01160441, "auxiliary_loss_mlp": 0.01023693, "balance_loss_clip": 1.04765308, "balance_loss_mlp": 1.01713037, "epoch": 0.7637828413395058, "flos": 23734583654400.0, "grad_norm": 1.5014443500598753, "language_loss": 0.7594313, "learning_rate": 5.571662515837814e-07, "loss": 0.78127265, "num_input_tokens_seen": 136637220, "step": 6352, "time_per_iteration": 3.310213327407837 }, { "auxiliary_loss_clip": 0.01135589, "auxiliary_loss_mlp": 0.01022655, "balance_loss_clip": 1.04500389, "balance_loss_mlp": 1.01610172, "epoch": 0.7639030842301449, "flos": 36283279461120.0, "grad_norm": 1.5560494706296821, "language_loss": 0.83729237, "learning_rate": 5.566269223580926e-07, "loss": 0.8588748, "num_input_tokens_seen": 136658930, "step": 6353, "time_per_iteration": 2.6533987522125244 }, { "auxiliary_loss_clip": 0.01151741, "auxiliary_loss_mlp": 0.010273, "balance_loss_clip": 1.04584289, "balance_loss_mlp": 1.02080607, "epoch": 0.764023327120784, "flos": 28878104609280.0, "grad_norm": 1.595168008154554, "language_loss": 0.75391185, "learning_rate": 5.560878120909511e-07, "loss": 0.7757023, "num_input_tokens_seen": 136681530, "step": 6354, "time_per_iteration": 2.546997308731079 }, { "auxiliary_loss_clip": 0.0104791, "auxiliary_loss_mlp": 0.01004474, "balance_loss_clip": 1.0091083, "balance_loss_mlp": 1.00334167, "epoch": 0.7641435700114231, "flos": 64789711067520.0, "grad_norm": 0.8410175480921163, "language_loss": 0.58633167, "learning_rate": 5.55548920864141e-07, "loss": 0.60685551, "num_input_tokens_seen": 136742185, "step": 6355, "time_per_iteration": 3.1071739196777344 }, { "auxiliary_loss_clip": 0.01151129, "auxiliary_loss_mlp": 0.01028929, "balance_loss_clip": 1.05033982, "balance_loss_mlp": 1.02227736, "epoch": 0.7642638129020621, "flos": 16835784785280.0, "grad_norm": 1.798471801209253, "language_loss": 0.78148651, "learning_rate": 5.550102487594113e-07, "loss": 0.80328715, "num_input_tokens_seen": 136760855, "step": 6356, "time_per_iteration": 2.4796981811523438 }, { "auxiliary_loss_clip": 0.01110068, "auxiliary_loss_mlp": 0.00760653, "balance_loss_clip": 1.04043889, "balance_loss_mlp": 1.00023746, "epoch": 0.7643840557927013, "flos": 30408940391040.0, "grad_norm": 1.5610727290281945, "language_loss": 0.71611345, "learning_rate": 5.54471795858477e-07, "loss": 0.73482066, "num_input_tokens_seen": 136780925, "step": 6357, "time_per_iteration": 2.64902925491333 }, { "auxiliary_loss_clip": 0.01120297, "auxiliary_loss_mlp": 0.0102601, "balance_loss_clip": 1.03970838, "balance_loss_mlp": 1.01913714, "epoch": 0.7645042986833404, "flos": 16983234115200.0, "grad_norm": 2.244122700963788, "language_loss": 0.82912391, "learning_rate": 5.539335622430235e-07, "loss": 0.85058701, "num_input_tokens_seen": 136799545, "step": 6358, "time_per_iteration": 2.552835702896118 }, { "auxiliary_loss_clip": 0.01144076, "auxiliary_loss_mlp": 0.01024331, "balance_loss_clip": 1.04372573, "balance_loss_mlp": 1.01736903, "epoch": 0.7646245415739794, "flos": 17311493531520.0, "grad_norm": 1.9392747431724207, "language_loss": 0.74648976, "learning_rate": 5.533955479946975e-07, "loss": 0.76817387, "num_input_tokens_seen": 136818325, "step": 6359, "time_per_iteration": 2.4674482345581055 }, { "auxiliary_loss_clip": 0.0101505, "auxiliary_loss_mlp": 0.00750892, "balance_loss_clip": 1.01246977, "balance_loss_mlp": 1.0000124, "epoch": 0.7647447844646186, "flos": 70402332666240.0, "grad_norm": 0.8524677021695252, "language_loss": 0.65775782, "learning_rate": 5.528577531951173e-07, "loss": 0.67541724, "num_input_tokens_seen": 136878730, "step": 6360, "time_per_iteration": 3.115774393081665 }, { "auxiliary_loss_clip": 0.01137955, "auxiliary_loss_mlp": 0.01026071, "balance_loss_clip": 1.04453433, "balance_loss_mlp": 1.01964545, "epoch": 0.7648650273552576, "flos": 17675914965120.0, "grad_norm": 1.9828307627069413, "language_loss": 0.74076986, "learning_rate": 5.523201779258653e-07, "loss": 0.7624101, "num_input_tokens_seen": 136897705, "step": 6361, "time_per_iteration": 2.4971768856048584 }, { "auxiliary_loss_clip": 0.0116476, "auxiliary_loss_mlp": 0.01023252, "balance_loss_clip": 1.04731965, "balance_loss_mlp": 1.01609647, "epoch": 0.7649852702458967, "flos": 22162019247360.0, "grad_norm": 1.7824519658894622, "language_loss": 0.83907866, "learning_rate": 5.517828222684912e-07, "loss": 0.86095876, "num_input_tokens_seen": 136918360, "step": 6362, "time_per_iteration": 2.4656870365142822 }, { "auxiliary_loss_clip": 0.01037882, "auxiliary_loss_mlp": 0.01003384, "balance_loss_clip": 1.01162744, "balance_loss_mlp": 1.00228143, "epoch": 0.7651055131365359, "flos": 69848338227840.0, "grad_norm": 0.7737078642485606, "language_loss": 0.59101796, "learning_rate": 5.512456863045117e-07, "loss": 0.61143064, "num_input_tokens_seen": 136979050, "step": 6363, "time_per_iteration": 3.1221446990966797 }, { "auxiliary_loss_clip": 0.01162854, "auxiliary_loss_mlp": 0.01026218, "balance_loss_clip": 1.04639876, "balance_loss_mlp": 1.01883268, "epoch": 0.7652257560271749, "flos": 19464014931840.0, "grad_norm": 1.6368106367265947, "language_loss": 0.73981816, "learning_rate": 5.507087701154089e-07, "loss": 0.76170886, "num_input_tokens_seen": 136998970, "step": 6364, "time_per_iteration": 2.471106767654419 }, { "auxiliary_loss_clip": 0.01111203, "auxiliary_loss_mlp": 0.01030989, "balance_loss_clip": 1.04173875, "balance_loss_mlp": 1.02418804, "epoch": 0.765345998917814, "flos": 15961108700160.0, "grad_norm": 2.8690555840516625, "language_loss": 0.75353587, "learning_rate": 5.50172073782634e-07, "loss": 0.77495778, "num_input_tokens_seen": 137016950, "step": 6365, "time_per_iteration": 2.5637259483337402 }, { "auxiliary_loss_clip": 0.01120088, "auxiliary_loss_mlp": 0.01027972, "balance_loss_clip": 1.04548073, "balance_loss_mlp": 1.02132261, "epoch": 0.7654662418084531, "flos": 23659853408640.0, "grad_norm": 1.7348416832205575, "language_loss": 0.87720394, "learning_rate": 5.496355973876023e-07, "loss": 0.89868456, "num_input_tokens_seen": 137036205, "step": 6366, "time_per_iteration": 3.344924211502075 }, { "auxiliary_loss_clip": 0.0111876, "auxiliary_loss_mlp": 0.00761424, "balance_loss_clip": 1.0413965, "balance_loss_mlp": 1.00020552, "epoch": 0.7655864846990922, "flos": 41463608878080.0, "grad_norm": 1.7847356064734612, "language_loss": 0.7109251, "learning_rate": 5.490993410116984e-07, "loss": 0.72972703, "num_input_tokens_seen": 137059195, "step": 6367, "time_per_iteration": 2.739509344100952 }, { "auxiliary_loss_clip": 0.01113681, "auxiliary_loss_mlp": 0.01024978, "balance_loss_clip": 1.04092908, "balance_loss_mlp": 1.01827812, "epoch": 0.7657067275897312, "flos": 43142684088960.0, "grad_norm": 1.598759116198481, "language_loss": 0.70075858, "learning_rate": 5.485633047362704e-07, "loss": 0.7221452, "num_input_tokens_seen": 137081200, "step": 6368, "time_per_iteration": 2.7446553707122803 }, { "auxiliary_loss_clip": 0.01169234, "auxiliary_loss_mlp": 0.01033419, "balance_loss_clip": 1.05067754, "balance_loss_mlp": 1.02575648, "epoch": 0.7658269704803703, "flos": 17311780840320.0, "grad_norm": 1.8301820493381575, "language_loss": 0.78455609, "learning_rate": 5.480274886426341e-07, "loss": 0.80658257, "num_input_tokens_seen": 137097840, "step": 6369, "time_per_iteration": 2.435459613800049 }, { "auxiliary_loss_clip": 0.01145436, "auxiliary_loss_mlp": 0.0102683, "balance_loss_clip": 1.04728246, "balance_loss_mlp": 1.01974916, "epoch": 0.7659472133710095, "flos": 12568160977920.0, "grad_norm": 2.07059247950138, "language_loss": 0.77812505, "learning_rate": 5.474918928120744e-07, "loss": 0.79984772, "num_input_tokens_seen": 137114335, "step": 6370, "time_per_iteration": 2.472855806350708 }, { "auxiliary_loss_clip": 0.01148611, "auxiliary_loss_mlp": 0.01024105, "balance_loss_clip": 1.04701924, "balance_loss_mlp": 1.01788199, "epoch": 0.7660674562616485, "flos": 22707430335360.0, "grad_norm": 2.513685171015111, "language_loss": 0.87263155, "learning_rate": 5.469565173258392e-07, "loss": 0.89435875, "num_input_tokens_seen": 137132850, "step": 6371, "time_per_iteration": 2.5022947788238525 }, { "auxiliary_loss_clip": 0.01168067, "auxiliary_loss_mlp": 0.01025572, "balance_loss_clip": 1.04753184, "balance_loss_mlp": 1.01813006, "epoch": 0.7661876991522876, "flos": 17056455989760.0, "grad_norm": 2.0985176289185237, "language_loss": 0.6343652, "learning_rate": 5.464213622651454e-07, "loss": 0.65630156, "num_input_tokens_seen": 137150665, "step": 6372, "time_per_iteration": 2.421651840209961 }, { "auxiliary_loss_clip": 0.01132184, "auxiliary_loss_mlp": 0.01028128, "balance_loss_clip": 1.0474087, "balance_loss_mlp": 1.02133036, "epoch": 0.7663079420429267, "flos": 20084228092800.0, "grad_norm": 1.6100169399190218, "language_loss": 0.84142804, "learning_rate": 5.458864277111753e-07, "loss": 0.86303127, "num_input_tokens_seen": 137168500, "step": 6373, "time_per_iteration": 2.566838026046753 }, { "auxiliary_loss_clip": 0.0113041, "auxiliary_loss_mlp": 0.00760415, "balance_loss_clip": 1.0428375, "balance_loss_mlp": 1.0002085, "epoch": 0.7664281849335658, "flos": 12677473042560.0, "grad_norm": 2.1421963015460372, "language_loss": 0.68665141, "learning_rate": 5.453517137450769e-07, "loss": 0.70555967, "num_input_tokens_seen": 137185075, "step": 6374, "time_per_iteration": 2.493882656097412 }, { "auxiliary_loss_clip": 0.01153817, "auxiliary_loss_mlp": 0.01022856, "balance_loss_clip": 1.04901934, "balance_loss_mlp": 1.01569784, "epoch": 0.7665484278242048, "flos": 22345271458560.0, "grad_norm": 1.6087903778414263, "language_loss": 0.75858778, "learning_rate": 5.448172204479684e-07, "loss": 0.7803545, "num_input_tokens_seen": 137204355, "step": 6375, "time_per_iteration": 2.502561330795288 }, { "auxiliary_loss_clip": 0.01162091, "auxiliary_loss_mlp": 0.01023018, "balance_loss_clip": 1.04639006, "balance_loss_mlp": 1.01638389, "epoch": 0.766668670714844, "flos": 23617909301760.0, "grad_norm": 1.5765141797159277, "language_loss": 0.74712133, "learning_rate": 5.442829479009294e-07, "loss": 0.76897246, "num_input_tokens_seen": 137223135, "step": 6376, "time_per_iteration": 3.257929563522339 }, { "auxiliary_loss_clip": 0.01153073, "auxiliary_loss_mlp": 0.01026591, "balance_loss_clip": 1.04702163, "balance_loss_mlp": 1.01892543, "epoch": 0.7667889136054831, "flos": 19427134642560.0, "grad_norm": 2.150556942665171, "language_loss": 0.71551037, "learning_rate": 5.437488961850103e-07, "loss": 0.73730707, "num_input_tokens_seen": 137242935, "step": 6377, "time_per_iteration": 2.4704620838165283 }, { "auxiliary_loss_clip": 0.01107528, "auxiliary_loss_mlp": 0.01027276, "balance_loss_clip": 1.04262471, "balance_loss_mlp": 1.02087176, "epoch": 0.7669091564961221, "flos": 26866352609280.0, "grad_norm": 1.6842195202731018, "language_loss": 0.75392795, "learning_rate": 5.432150653812258e-07, "loss": 0.77527606, "num_input_tokens_seen": 137262970, "step": 6378, "time_per_iteration": 4.146621227264404 }, { "auxiliary_loss_clip": 0.01145726, "auxiliary_loss_mlp": 0.01025816, "balance_loss_clip": 1.04627621, "balance_loss_mlp": 1.01906872, "epoch": 0.7670293993867613, "flos": 12385303816320.0, "grad_norm": 1.9391776451888283, "language_loss": 0.82509184, "learning_rate": 5.42681455570557e-07, "loss": 0.84680718, "num_input_tokens_seen": 137279500, "step": 6379, "time_per_iteration": 2.4746603965759277 }, { "auxiliary_loss_clip": 0.01158146, "auxiliary_loss_mlp": 0.01023705, "balance_loss_clip": 1.04458928, "balance_loss_mlp": 1.0167253, "epoch": 0.7671496422774003, "flos": 21762944167680.0, "grad_norm": 2.1733985760190393, "language_loss": 0.64911819, "learning_rate": 5.42148066833954e-07, "loss": 0.6709367, "num_input_tokens_seen": 137298745, "step": 6380, "time_per_iteration": 2.4844486713409424 }, { "auxiliary_loss_clip": 0.0116121, "auxiliary_loss_mlp": 0.01023623, "balance_loss_clip": 1.04753733, "balance_loss_mlp": 1.01701593, "epoch": 0.7672698851680394, "flos": 21069221823360.0, "grad_norm": 1.8734702093360256, "language_loss": 0.7527163, "learning_rate": 5.416148992523289e-07, "loss": 0.77456456, "num_input_tokens_seen": 137317320, "step": 6381, "time_per_iteration": 2.444093704223633 }, { "auxiliary_loss_clip": 0.01081235, "auxiliary_loss_mlp": 0.01024634, "balance_loss_clip": 1.0426532, "balance_loss_mlp": 1.01805699, "epoch": 0.7673901280586786, "flos": 16976697840000.0, "grad_norm": 1.6488511126466572, "language_loss": 0.78342968, "learning_rate": 5.410819529065644e-07, "loss": 0.80448842, "num_input_tokens_seen": 137335275, "step": 6382, "time_per_iteration": 2.6573266983032227 }, { "auxiliary_loss_clip": 0.01107249, "auxiliary_loss_mlp": 0.01028106, "balance_loss_clip": 1.04062986, "balance_loss_mlp": 1.02103972, "epoch": 0.7675103709493176, "flos": 29242669697280.0, "grad_norm": 1.7749751680167105, "language_loss": 0.65039605, "learning_rate": 5.405492278775079e-07, "loss": 0.67174959, "num_input_tokens_seen": 137355055, "step": 6383, "time_per_iteration": 2.6509013175964355 }, { "auxiliary_loss_clip": 0.01136538, "auxiliary_loss_mlp": 0.01024178, "balance_loss_clip": 1.04371452, "balance_loss_mlp": 1.01688874, "epoch": 0.7676306138399567, "flos": 29023004073600.0, "grad_norm": 1.9638934708340456, "language_loss": 0.80159479, "learning_rate": 5.400167242459732e-07, "loss": 0.82320195, "num_input_tokens_seen": 137374015, "step": 6384, "time_per_iteration": 2.58976149559021 }, { "auxiliary_loss_clip": 0.01145794, "auxiliary_loss_mlp": 0.01024361, "balance_loss_clip": 1.04491985, "balance_loss_mlp": 1.01779866, "epoch": 0.7677508567305958, "flos": 22565116650240.0, "grad_norm": 1.655398959915776, "language_loss": 0.80510634, "learning_rate": 5.394844420927405e-07, "loss": 0.82680792, "num_input_tokens_seen": 137393625, "step": 6385, "time_per_iteration": 2.508486032485962 }, { "auxiliary_loss_clip": 0.01159465, "auxiliary_loss_mlp": 0.01024423, "balance_loss_clip": 1.04529858, "balance_loss_mlp": 1.01728237, "epoch": 0.7678710996212349, "flos": 25411432222080.0, "grad_norm": 2.046150424050049, "language_loss": 0.7341246, "learning_rate": 5.389523814985562e-07, "loss": 0.75596344, "num_input_tokens_seen": 137413045, "step": 6386, "time_per_iteration": 2.51172137260437 }, { "auxiliary_loss_clip": 0.01104347, "auxiliary_loss_mlp": 0.01024016, "balance_loss_clip": 1.040097, "balance_loss_mlp": 1.01717091, "epoch": 0.767991342511874, "flos": 26756825063040.0, "grad_norm": 1.9497956210470164, "language_loss": 0.76289546, "learning_rate": 5.384205425441344e-07, "loss": 0.78417915, "num_input_tokens_seen": 137433955, "step": 6387, "time_per_iteration": 2.6410443782806396 }, { "auxiliary_loss_clip": 0.0113843, "auxiliary_loss_mlp": 0.01022627, "balance_loss_clip": 1.04460013, "balance_loss_mlp": 1.01595163, "epoch": 0.7681115854025131, "flos": 26359509749760.0, "grad_norm": 1.641148196280358, "language_loss": 0.84357321, "learning_rate": 5.378889253101537e-07, "loss": 0.86518371, "num_input_tokens_seen": 137454510, "step": 6388, "time_per_iteration": 2.5982351303100586 }, { "auxiliary_loss_clip": 0.01148324, "auxiliary_loss_mlp": 0.01023014, "balance_loss_clip": 1.04364121, "balance_loss_mlp": 1.0164305, "epoch": 0.7682318282931522, "flos": 23257043314560.0, "grad_norm": 1.720035926421481, "language_loss": 0.808752, "learning_rate": 5.373575298772617e-07, "loss": 0.83046538, "num_input_tokens_seen": 137473630, "step": 6389, "time_per_iteration": 2.508378267288208 }, { "auxiliary_loss_clip": 0.01047613, "auxiliary_loss_mlp": 0.01000535, "balance_loss_clip": 1.00859785, "balance_loss_mlp": 0.99942034, "epoch": 0.7683520711837912, "flos": 70072457137920.0, "grad_norm": 0.7552759285509741, "language_loss": 0.61352092, "learning_rate": 5.368263563260689e-07, "loss": 0.63400239, "num_input_tokens_seen": 137538765, "step": 6390, "time_per_iteration": 3.1872360706329346 }, { "auxiliary_loss_clip": 0.01148478, "auxiliary_loss_mlp": 0.010237, "balance_loss_clip": 1.04422522, "balance_loss_mlp": 1.01702976, "epoch": 0.7684723140744304, "flos": 18624890332800.0, "grad_norm": 1.4949110052754477, "language_loss": 0.64542645, "learning_rate": 5.362954047371537e-07, "loss": 0.66714823, "num_input_tokens_seen": 137557875, "step": 6391, "time_per_iteration": 2.482348918914795 }, { "auxiliary_loss_clip": 0.01121485, "auxiliary_loss_mlp": 0.01024868, "balance_loss_clip": 1.04714584, "balance_loss_mlp": 1.01733446, "epoch": 0.7685925569650695, "flos": 27452989532160.0, "grad_norm": 1.8230751769463514, "language_loss": 0.71946347, "learning_rate": 5.357646751910627e-07, "loss": 0.74092698, "num_input_tokens_seen": 137579055, "step": 6392, "time_per_iteration": 3.3394582271575928 }, { "auxiliary_loss_clip": 0.01131201, "auxiliary_loss_mlp": 0.01022773, "balance_loss_clip": 1.0419805, "balance_loss_mlp": 1.01556396, "epoch": 0.7687127998557085, "flos": 24535714642560.0, "grad_norm": 2.0285053753777924, "language_loss": 0.79819858, "learning_rate": 5.352341677683061e-07, "loss": 0.81973827, "num_input_tokens_seen": 137600355, "step": 6393, "time_per_iteration": 2.549882411956787 }, { "auxiliary_loss_clip": 0.01128285, "auxiliary_loss_mlp": 0.01027412, "balance_loss_clip": 1.04348516, "balance_loss_mlp": 1.02056003, "epoch": 0.7688330427463477, "flos": 25155963717120.0, "grad_norm": 1.8587208202714947, "language_loss": 0.78894484, "learning_rate": 5.347038825493617e-07, "loss": 0.81050175, "num_input_tokens_seen": 137621885, "step": 6394, "time_per_iteration": 2.5968708992004395 }, { "auxiliary_loss_clip": 0.0113332, "auxiliary_loss_mlp": 0.01025156, "balance_loss_clip": 1.04653466, "balance_loss_mlp": 1.01850414, "epoch": 0.7689532856369867, "flos": 21211284113280.0, "grad_norm": 2.089386710532059, "language_loss": 0.6866132, "learning_rate": 5.341738196146732e-07, "loss": 0.70819795, "num_input_tokens_seen": 137640230, "step": 6395, "time_per_iteration": 2.532658338546753 }, { "auxiliary_loss_clip": 0.01144332, "auxiliary_loss_mlp": 0.01022543, "balance_loss_clip": 1.04212439, "balance_loss_mlp": 1.01548553, "epoch": 0.7690735285276258, "flos": 25119083427840.0, "grad_norm": 2.105038495461312, "language_loss": 0.73342848, "learning_rate": 5.336439790446503e-07, "loss": 0.75509727, "num_input_tokens_seen": 137659330, "step": 6396, "time_per_iteration": 2.65083384513855 }, { "auxiliary_loss_clip": 0.01117601, "auxiliary_loss_mlp": 0.01023711, "balance_loss_clip": 1.04098105, "balance_loss_mlp": 1.01704383, "epoch": 0.769193771418265, "flos": 54744020640000.0, "grad_norm": 1.8633060771115304, "language_loss": 0.63061708, "learning_rate": 5.331143609196711e-07, "loss": 0.65203017, "num_input_tokens_seen": 137683145, "step": 6397, "time_per_iteration": 2.9449517726898193 }, { "auxiliary_loss_clip": 0.01150394, "auxiliary_loss_mlp": 0.01025051, "balance_loss_clip": 1.04847085, "balance_loss_mlp": 1.01824379, "epoch": 0.769314014308904, "flos": 37341890115840.0, "grad_norm": 1.9527778582427457, "language_loss": 0.77427357, "learning_rate": 5.325849653200758e-07, "loss": 0.79602802, "num_input_tokens_seen": 137707095, "step": 6398, "time_per_iteration": 2.621087074279785 }, { "auxiliary_loss_clip": 0.01164048, "auxiliary_loss_mlp": 0.01027977, "balance_loss_clip": 1.04842067, "balance_loss_mlp": 1.02089012, "epoch": 0.7694342571995431, "flos": 20631686256000.0, "grad_norm": 1.6423741286904339, "language_loss": 0.76109594, "learning_rate": 5.32055792326175e-07, "loss": 0.7830162, "num_input_tokens_seen": 137725520, "step": 6399, "time_per_iteration": 2.490973949432373 }, { "auxiliary_loss_clip": 0.01138871, "auxiliary_loss_mlp": 0.01025488, "balance_loss_clip": 1.0470525, "balance_loss_mlp": 1.01831794, "epoch": 0.7695545000901821, "flos": 24207706621440.0, "grad_norm": 1.8466481262118786, "language_loss": 0.72717899, "learning_rate": 5.315268420182437e-07, "loss": 0.74882263, "num_input_tokens_seen": 137744195, "step": 6400, "time_per_iteration": 2.544527530670166 }, { "auxiliary_loss_clip": 0.01129093, "auxiliary_loss_mlp": 0.00760376, "balance_loss_clip": 1.04442906, "balance_loss_mlp": 1.00022554, "epoch": 0.7696747429808213, "flos": 28001273708160.0, "grad_norm": 2.232961975106676, "language_loss": 0.76262093, "learning_rate": 5.309981144765221e-07, "loss": 0.7815156, "num_input_tokens_seen": 137764340, "step": 6401, "time_per_iteration": 2.627885580062866 }, { "auxiliary_loss_clip": 0.01114566, "auxiliary_loss_mlp": 0.01027527, "balance_loss_clip": 1.04159391, "balance_loss_mlp": 1.02029145, "epoch": 0.7697949858714603, "flos": 11509550323200.0, "grad_norm": 2.0256091307156545, "language_loss": 0.75303411, "learning_rate": 5.304696097812196e-07, "loss": 0.77445507, "num_input_tokens_seen": 137780940, "step": 6402, "time_per_iteration": 3.3409650325775146 }, { "auxiliary_loss_clip": 0.01134182, "auxiliary_loss_mlp": 0.01032328, "balance_loss_clip": 1.04534316, "balance_loss_mlp": 1.02459764, "epoch": 0.7699152287620994, "flos": 26688271956480.0, "grad_norm": 2.735328713060162, "language_loss": 0.60040563, "learning_rate": 5.299413280125078e-07, "loss": 0.62207079, "num_input_tokens_seen": 137799250, "step": 6403, "time_per_iteration": 2.5599632263183594 }, { "auxiliary_loss_clip": 0.01136219, "auxiliary_loss_mlp": 0.01023727, "balance_loss_clip": 1.04343772, "balance_loss_mlp": 1.01704824, "epoch": 0.7700354716527386, "flos": 16544944362240.0, "grad_norm": 1.8590350838044205, "language_loss": 0.72395968, "learning_rate": 5.294132692505284e-07, "loss": 0.74555922, "num_input_tokens_seen": 137817660, "step": 6404, "time_per_iteration": 4.044933795928955 }, { "auxiliary_loss_clip": 0.01097687, "auxiliary_loss_mlp": 0.01029957, "balance_loss_clip": 1.03825092, "balance_loss_mlp": 1.02313209, "epoch": 0.7701557145433776, "flos": 19242733196160.0, "grad_norm": 1.8326223960576682, "language_loss": 0.79349428, "learning_rate": 5.288854335753861e-07, "loss": 0.8147707, "num_input_tokens_seen": 137835920, "step": 6405, "time_per_iteration": 2.559601306915283 }, { "auxiliary_loss_clip": 0.01152628, "auxiliary_loss_mlp": 0.01020785, "balance_loss_clip": 1.04664242, "balance_loss_mlp": 1.01414835, "epoch": 0.7702759574340167, "flos": 31685744211840.0, "grad_norm": 1.5924572349572663, "language_loss": 0.75516945, "learning_rate": 5.283578210671551e-07, "loss": 0.77690351, "num_input_tokens_seen": 137858160, "step": 6406, "time_per_iteration": 2.58896803855896 }, { "auxiliary_loss_clip": 0.01140248, "auxiliary_loss_mlp": 0.01021549, "balance_loss_clip": 1.04620063, "balance_loss_mlp": 1.01546907, "epoch": 0.7703962003246558, "flos": 16800089644800.0, "grad_norm": 1.901077472047307, "language_loss": 0.76306444, "learning_rate": 5.278304318058719e-07, "loss": 0.78468239, "num_input_tokens_seen": 137876015, "step": 6407, "time_per_iteration": 2.5181946754455566 }, { "auxiliary_loss_clip": 0.01091703, "auxiliary_loss_mlp": 0.01026798, "balance_loss_clip": 1.03870416, "balance_loss_mlp": 1.01963329, "epoch": 0.7705164432152949, "flos": 35736072693120.0, "grad_norm": 1.843010964209276, "language_loss": 0.78982174, "learning_rate": 5.273032658715411e-07, "loss": 0.81100667, "num_input_tokens_seen": 137898825, "step": 6408, "time_per_iteration": 2.7308335304260254 }, { "auxiliary_loss_clip": 0.01105775, "auxiliary_loss_mlp": 0.01028235, "balance_loss_clip": 1.04184008, "balance_loss_mlp": 1.02145231, "epoch": 0.7706366861059339, "flos": 23365960329600.0, "grad_norm": 2.0453285610750833, "language_loss": 0.76972187, "learning_rate": 5.267763233441347e-07, "loss": 0.791062, "num_input_tokens_seen": 137919455, "step": 6409, "time_per_iteration": 2.65564227104187 }, { "auxiliary_loss_clip": 0.01153403, "auxiliary_loss_mlp": 0.01029903, "balance_loss_clip": 1.04801106, "balance_loss_mlp": 1.02208638, "epoch": 0.7707569289965731, "flos": 22929897219840.0, "grad_norm": 2.0551105671964325, "language_loss": 0.69387698, "learning_rate": 5.26249604303588e-07, "loss": 0.71571004, "num_input_tokens_seen": 137937960, "step": 6410, "time_per_iteration": 2.5191636085510254 }, { "auxiliary_loss_clip": 0.01164509, "auxiliary_loss_mlp": 0.01025869, "balance_loss_clip": 1.04845619, "balance_loss_mlp": 1.01937795, "epoch": 0.7708771718872122, "flos": 17420661941760.0, "grad_norm": 2.0448114769913928, "language_loss": 0.78078723, "learning_rate": 5.257231088298057e-07, "loss": 0.80269098, "num_input_tokens_seen": 137956370, "step": 6411, "time_per_iteration": 2.4659805297851562 }, { "auxiliary_loss_clip": 0.0102258, "auxiliary_loss_mlp": 0.01003078, "balance_loss_clip": 1.00798464, "balance_loss_mlp": 1.00195754, "epoch": 0.7709974147778512, "flos": 72241316248320.0, "grad_norm": 0.8182572661422302, "language_loss": 0.54007554, "learning_rate": 5.25196837002655e-07, "loss": 0.56033218, "num_input_tokens_seen": 138016080, "step": 6412, "time_per_iteration": 3.186418294906616 }, { "auxiliary_loss_clip": 0.01132906, "auxiliary_loss_mlp": 0.0103217, "balance_loss_clip": 1.04477417, "balance_loss_mlp": 1.02481794, "epoch": 0.7711176576684904, "flos": 39859694876160.0, "grad_norm": 2.1684231501459577, "language_loss": 0.68583512, "learning_rate": 5.24670788901971e-07, "loss": 0.7074858, "num_input_tokens_seen": 138039170, "step": 6413, "time_per_iteration": 2.699737787246704 }, { "auxiliary_loss_clip": 0.01136456, "auxiliary_loss_mlp": 0.01032307, "balance_loss_clip": 1.04768336, "balance_loss_mlp": 1.02450204, "epoch": 0.7712379005591294, "flos": 36976391274240.0, "grad_norm": 2.066212159095324, "language_loss": 0.6813333, "learning_rate": 5.241449646075557e-07, "loss": 0.70302093, "num_input_tokens_seen": 138062395, "step": 6414, "time_per_iteration": 2.651027202606201 }, { "auxiliary_loss_clip": 0.01156198, "auxiliary_loss_mlp": 0.01028671, "balance_loss_clip": 1.04643333, "balance_loss_mlp": 1.02149129, "epoch": 0.7713581434497685, "flos": 22776773541120.0, "grad_norm": 2.0726864616829515, "language_loss": 0.72804213, "learning_rate": 5.236193641991762e-07, "loss": 0.7498908, "num_input_tokens_seen": 138080325, "step": 6415, "time_per_iteration": 2.5300230979919434 }, { "auxiliary_loss_clip": 0.0113531, "auxiliary_loss_mlp": 0.01025471, "balance_loss_clip": 1.04646349, "balance_loss_mlp": 1.01851773, "epoch": 0.7714783863404077, "flos": 24097460803200.0, "grad_norm": 1.7511401401440978, "language_loss": 0.69727266, "learning_rate": 5.23093987756565e-07, "loss": 0.71888053, "num_input_tokens_seen": 138099020, "step": 6416, "time_per_iteration": 2.557650566101074 }, { "auxiliary_loss_clip": 0.01126206, "auxiliary_loss_mlp": 0.01032733, "balance_loss_clip": 1.03991628, "balance_loss_mlp": 1.02537763, "epoch": 0.7715986292310467, "flos": 21063655215360.0, "grad_norm": 1.658660304565217, "language_loss": 0.75392139, "learning_rate": 5.225688353594217e-07, "loss": 0.77551079, "num_input_tokens_seen": 138118650, "step": 6417, "time_per_iteration": 2.6053831577301025 }, { "auxiliary_loss_clip": 0.01143042, "auxiliary_loss_mlp": 0.00760549, "balance_loss_clip": 1.04745722, "balance_loss_mlp": 1.00021398, "epoch": 0.7717188721216858, "flos": 20594877793920.0, "grad_norm": 2.103647854230917, "language_loss": 0.77558756, "learning_rate": 5.220439070874108e-07, "loss": 0.79462343, "num_input_tokens_seen": 138137890, "step": 6418, "time_per_iteration": 2.558582067489624 }, { "auxiliary_loss_clip": 0.0115341, "auxiliary_loss_mlp": 0.01022986, "balance_loss_clip": 1.04994786, "balance_loss_mlp": 1.01620269, "epoch": 0.7718391150123249, "flos": 26250951870720.0, "grad_norm": 2.7691769409386744, "language_loss": 0.7079711, "learning_rate": 5.215192030201652e-07, "loss": 0.72973508, "num_input_tokens_seen": 138158880, "step": 6419, "time_per_iteration": 3.346935987472534 }, { "auxiliary_loss_clip": 0.01109081, "auxiliary_loss_mlp": 0.01027726, "balance_loss_clip": 1.03940547, "balance_loss_mlp": 1.02097869, "epoch": 0.771959357902964, "flos": 22049762267520.0, "grad_norm": 1.7082350388563132, "language_loss": 0.86181515, "learning_rate": 5.209947232372798e-07, "loss": 0.88318324, "num_input_tokens_seen": 138176370, "step": 6420, "time_per_iteration": 2.585092306137085 }, { "auxiliary_loss_clip": 0.01153634, "auxiliary_loss_mlp": 0.00760601, "balance_loss_clip": 1.04614663, "balance_loss_mlp": 1.00023079, "epoch": 0.772079600793603, "flos": 30446000248320.0, "grad_norm": 1.6230721827542016, "language_loss": 0.81024241, "learning_rate": 5.204704678183196e-07, "loss": 0.82938474, "num_input_tokens_seen": 138195105, "step": 6421, "time_per_iteration": 2.582650899887085 }, { "auxiliary_loss_clip": 0.01166626, "auxiliary_loss_mlp": 0.01022434, "balance_loss_clip": 1.04990768, "balance_loss_mlp": 1.01556134, "epoch": 0.7721998436842422, "flos": 12969857750400.0, "grad_norm": 1.7361202300143974, "language_loss": 0.84869528, "learning_rate": 5.19946436842813e-07, "loss": 0.87058592, "num_input_tokens_seen": 138212235, "step": 6422, "time_per_iteration": 2.447450876235962 }, { "auxiliary_loss_clip": 0.01119401, "auxiliary_loss_mlp": 0.01023831, "balance_loss_clip": 1.04255271, "balance_loss_mlp": 1.01756048, "epoch": 0.7723200865748813, "flos": 32635509678720.0, "grad_norm": 1.648709464697266, "language_loss": 0.68407154, "learning_rate": 5.194226303902546e-07, "loss": 0.70550382, "num_input_tokens_seen": 138231970, "step": 6423, "time_per_iteration": 2.706016778945923 }, { "auxiliary_loss_clip": 0.01134844, "auxiliary_loss_mlp": 0.01023108, "balance_loss_clip": 1.04401445, "balance_loss_mlp": 1.01582432, "epoch": 0.7724403294655203, "flos": 21105707063040.0, "grad_norm": 1.7802655526290958, "language_loss": 0.7099728, "learning_rate": 5.188990485401072e-07, "loss": 0.73155236, "num_input_tokens_seen": 138251175, "step": 6424, "time_per_iteration": 2.542248249053955 }, { "auxiliary_loss_clip": 0.01151473, "auxiliary_loss_mlp": 0.01023799, "balance_loss_clip": 1.04828477, "balance_loss_mlp": 1.0168308, "epoch": 0.7725605723561595, "flos": 22090736707200.0, "grad_norm": 1.745704902969292, "language_loss": 0.86208785, "learning_rate": 5.183756913717954e-07, "loss": 0.88384056, "num_input_tokens_seen": 138270950, "step": 6425, "time_per_iteration": 2.5610287189483643 }, { "auxiliary_loss_clip": 0.01133684, "auxiliary_loss_mlp": 0.01029883, "balance_loss_clip": 1.04556513, "balance_loss_mlp": 1.02280152, "epoch": 0.7726808152467985, "flos": 34495610457600.0, "grad_norm": 1.8344887933971807, "language_loss": 0.72737086, "learning_rate": 5.178525589647136e-07, "loss": 0.74900651, "num_input_tokens_seen": 138292590, "step": 6426, "time_per_iteration": 2.6384189128875732 }, { "auxiliary_loss_clip": 0.01140859, "auxiliary_loss_mlp": 0.01025843, "balance_loss_clip": 1.0448662, "balance_loss_mlp": 1.0193758, "epoch": 0.7728010581374376, "flos": 22306344094080.0, "grad_norm": 1.9843517447337005, "language_loss": 0.78756529, "learning_rate": 5.173296513982197e-07, "loss": 0.80923229, "num_input_tokens_seen": 138311115, "step": 6427, "time_per_iteration": 2.556851625442505 }, { "auxiliary_loss_clip": 0.01132659, "auxiliary_loss_mlp": 0.01027297, "balance_loss_clip": 1.04557729, "balance_loss_mlp": 1.01999497, "epoch": 0.7729213010280768, "flos": 27126453968640.0, "grad_norm": 1.8882902076952024, "language_loss": 0.64927369, "learning_rate": 5.168069687516398e-07, "loss": 0.67087322, "num_input_tokens_seen": 138330885, "step": 6428, "time_per_iteration": 3.4023709297180176 }, { "auxiliary_loss_clip": 0.01138508, "auxiliary_loss_mlp": 0.01026906, "balance_loss_clip": 1.04870272, "balance_loss_mlp": 1.01983047, "epoch": 0.7730415439187158, "flos": 18150223080960.0, "grad_norm": 2.74337789418084, "language_loss": 0.7161448, "learning_rate": 5.16284511104263e-07, "loss": 0.73779905, "num_input_tokens_seen": 138350020, "step": 6429, "time_per_iteration": 2.5161678791046143 }, { "auxiliary_loss_clip": 0.01134852, "auxiliary_loss_mlp": 0.0103215, "balance_loss_clip": 1.0449475, "balance_loss_mlp": 1.02482438, "epoch": 0.7731617868093549, "flos": 11947480940160.0, "grad_norm": 3.0284045482112756, "language_loss": 0.80839634, "learning_rate": 5.157622785353457e-07, "loss": 0.83006638, "num_input_tokens_seen": 138368135, "step": 6430, "time_per_iteration": 4.072638034820557 }, { "auxiliary_loss_clip": 0.01047942, "auxiliary_loss_mlp": 0.01000345, "balance_loss_clip": 1.00930727, "balance_loss_mlp": 0.99923021, "epoch": 0.7732820296999939, "flos": 64201027069440.0, "grad_norm": 0.6432247736178279, "language_loss": 0.60389096, "learning_rate": 5.152402711241113e-07, "loss": 0.62437385, "num_input_tokens_seen": 138436040, "step": 6431, "time_per_iteration": 3.1827900409698486 }, { "auxiliary_loss_clip": 0.01118437, "auxiliary_loss_mlp": 0.01028451, "balance_loss_clip": 1.04126477, "balance_loss_mlp": 1.02193618, "epoch": 0.7734022725906331, "flos": 25302191984640.0, "grad_norm": 1.5558118230906002, "language_loss": 0.82793123, "learning_rate": 5.147184889497465e-07, "loss": 0.84940004, "num_input_tokens_seen": 138455510, "step": 6432, "time_per_iteration": 2.604698419570923 }, { "auxiliary_loss_clip": 0.01114735, "auxiliary_loss_mlp": 0.01025324, "balance_loss_clip": 1.04282355, "balance_loss_mlp": 1.0177896, "epoch": 0.7735225154812722, "flos": 17347440067200.0, "grad_norm": 2.3935628858707276, "language_loss": 0.80139041, "learning_rate": 5.141969320914072e-07, "loss": 0.82279098, "num_input_tokens_seen": 138473015, "step": 6433, "time_per_iteration": 2.531937837600708 }, { "auxiliary_loss_clip": 0.01165549, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.04681587, "balance_loss_mlp": 1.02007949, "epoch": 0.7736427583719112, "flos": 32630086725120.0, "grad_norm": 6.152047815374636, "language_loss": 0.62326634, "learning_rate": 5.136756006282113e-07, "loss": 0.64520478, "num_input_tokens_seen": 138491680, "step": 6434, "time_per_iteration": 2.569178342819214 }, { "auxiliary_loss_clip": 0.0116895, "auxiliary_loss_mlp": 0.01038238, "balance_loss_clip": 1.05102611, "balance_loss_mlp": 1.03064442, "epoch": 0.7737630012625504, "flos": 19860073269120.0, "grad_norm": 2.2641667986546175, "language_loss": 0.85234588, "learning_rate": 5.131544946392446e-07, "loss": 0.87441778, "num_input_tokens_seen": 138506960, "step": 6435, "time_per_iteration": 2.438101291656494 }, { "auxiliary_loss_clip": 0.01134354, "auxiliary_loss_mlp": 0.01028188, "balance_loss_clip": 1.04618645, "balance_loss_mlp": 1.02090132, "epoch": 0.7738832441531894, "flos": 36022639397760.0, "grad_norm": 1.9214723098600501, "language_loss": 0.64094543, "learning_rate": 5.126336142035592e-07, "loss": 0.66257083, "num_input_tokens_seen": 138526995, "step": 6436, "time_per_iteration": 2.626328229904175 }, { "auxiliary_loss_clip": 0.01135347, "auxiliary_loss_mlp": 0.01024595, "balance_loss_clip": 1.04242635, "balance_loss_mlp": 1.0176096, "epoch": 0.7740034870438285, "flos": 13405274415360.0, "grad_norm": 2.643458051517162, "language_loss": 0.72056687, "learning_rate": 5.121129594001721e-07, "loss": 0.74216628, "num_input_tokens_seen": 138541260, "step": 6437, "time_per_iteration": 2.4923996925354004 }, { "auxiliary_loss_clip": 0.01151308, "auxiliary_loss_mlp": 0.01031448, "balance_loss_clip": 1.04861772, "balance_loss_mlp": 1.02452517, "epoch": 0.7741237299344677, "flos": 22086714384000.0, "grad_norm": 1.5540147291959931, "language_loss": 0.81087315, "learning_rate": 5.115925303080661e-07, "loss": 0.83270067, "num_input_tokens_seen": 138560970, "step": 6438, "time_per_iteration": 2.5081732273101807 }, { "auxiliary_loss_clip": 0.01134267, "auxiliary_loss_mlp": 0.01021012, "balance_loss_clip": 1.04573989, "balance_loss_mlp": 1.01431572, "epoch": 0.7742439728251067, "flos": 19864777950720.0, "grad_norm": 1.8593351752264253, "language_loss": 0.79330182, "learning_rate": 5.110723270061899e-07, "loss": 0.81485462, "num_input_tokens_seen": 138577460, "step": 6439, "time_per_iteration": 2.489534854888916 }, { "auxiliary_loss_clip": 0.01161588, "auxiliary_loss_mlp": 0.01021291, "balance_loss_clip": 1.04709148, "balance_loss_mlp": 1.01478815, "epoch": 0.7743642157157458, "flos": 16690167048960.0, "grad_norm": 3.0232928796060428, "language_loss": 0.79402894, "learning_rate": 5.105523495734572e-07, "loss": 0.81585777, "num_input_tokens_seen": 138594860, "step": 6440, "time_per_iteration": 2.412870407104492 }, { "auxiliary_loss_clip": 0.01165723, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.04735756, "balance_loss_mlp": 1.02139211, "epoch": 0.7744844586063849, "flos": 20304360593280.0, "grad_norm": 1.5748189080194266, "language_loss": 0.75033569, "learning_rate": 5.100325980887499e-07, "loss": 0.77228022, "num_input_tokens_seen": 138614785, "step": 6441, "time_per_iteration": 2.46769380569458 }, { "auxiliary_loss_clip": 0.01139224, "auxiliary_loss_mlp": 0.01028942, "balance_loss_clip": 1.04587317, "balance_loss_mlp": 1.02186716, "epoch": 0.774604701497024, "flos": 22966705681920.0, "grad_norm": 1.788238122188804, "language_loss": 0.82951927, "learning_rate": 5.095130726309116e-07, "loss": 0.85120094, "num_input_tokens_seen": 138634960, "step": 6442, "time_per_iteration": 2.532679557800293 }, { "auxiliary_loss_clip": 0.01056431, "auxiliary_loss_mlp": 0.01000065, "balance_loss_clip": 1.00878119, "balance_loss_mlp": 0.99903405, "epoch": 0.774724944387663, "flos": 60288523073280.0, "grad_norm": 0.8222596346774548, "language_loss": 0.59054834, "learning_rate": 5.089937732787559e-07, "loss": 0.61111331, "num_input_tokens_seen": 138699520, "step": 6443, "time_per_iteration": 3.130455255508423 }, { "auxiliary_loss_clip": 0.01122391, "auxiliary_loss_mlp": 0.01025891, "balance_loss_clip": 1.04310584, "balance_loss_mlp": 1.01872015, "epoch": 0.7748451872783022, "flos": 26761026954240.0, "grad_norm": 2.13058300491253, "language_loss": 0.66755062, "learning_rate": 5.084747001110592e-07, "loss": 0.68903345, "num_input_tokens_seen": 138719145, "step": 6444, "time_per_iteration": 2.6195971965789795 }, { "auxiliary_loss_clip": 0.01143521, "auxiliary_loss_mlp": 0.00760612, "balance_loss_clip": 1.04632807, "balance_loss_mlp": 1.00024664, "epoch": 0.7749654301689413, "flos": 30338627518080.0, "grad_norm": 1.673585102441682, "language_loss": 0.70054698, "learning_rate": 5.07955853206564e-07, "loss": 0.71958828, "num_input_tokens_seen": 138743850, "step": 6445, "time_per_iteration": 3.3109371662139893 }, { "auxiliary_loss_clip": 0.01155202, "auxiliary_loss_mlp": 0.01025571, "balance_loss_clip": 1.04847431, "balance_loss_mlp": 1.01861846, "epoch": 0.7750856730595803, "flos": 43179851687040.0, "grad_norm": 2.2186866867521227, "language_loss": 0.71046883, "learning_rate": 5.074372326439807e-07, "loss": 0.73227656, "num_input_tokens_seen": 138766860, "step": 6446, "time_per_iteration": 2.687319278717041 }, { "auxiliary_loss_clip": 0.01122976, "auxiliary_loss_mlp": 0.01026665, "balance_loss_clip": 1.04376221, "balance_loss_mlp": 1.01953578, "epoch": 0.7752059159502195, "flos": 17640040256640.0, "grad_norm": 2.2778185383403025, "language_loss": 0.73152232, "learning_rate": 5.069188385019814e-07, "loss": 0.75301862, "num_input_tokens_seen": 138784560, "step": 6447, "time_per_iteration": 2.5269172191619873 }, { "auxiliary_loss_clip": 0.01115192, "auxiliary_loss_mlp": 0.01025683, "balance_loss_clip": 1.04189515, "balance_loss_mlp": 1.01873064, "epoch": 0.7753261588408585, "flos": 12677688524160.0, "grad_norm": 2.9163157008205243, "language_loss": 0.61134666, "learning_rate": 5.064006708592077e-07, "loss": 0.6327554, "num_input_tokens_seen": 138800805, "step": 6448, "time_per_iteration": 2.560739040374756 }, { "auxiliary_loss_clip": 0.0113244, "auxiliary_loss_mlp": 0.01024112, "balance_loss_clip": 1.0486325, "balance_loss_mlp": 1.01734102, "epoch": 0.7754464017314976, "flos": 16690741666560.0, "grad_norm": 2.6838110903232724, "language_loss": 0.75556868, "learning_rate": 5.058827297942641e-07, "loss": 0.77713418, "num_input_tokens_seen": 138815910, "step": 6449, "time_per_iteration": 2.4854915142059326 }, { "auxiliary_loss_clip": 0.01140167, "auxiliary_loss_mlp": 0.01025136, "balance_loss_clip": 1.04684615, "balance_loss_mlp": 1.01867521, "epoch": 0.7755666446221368, "flos": 19718944732800.0, "grad_norm": 1.8342279071406626, "language_loss": 0.75224912, "learning_rate": 5.053650153857237e-07, "loss": 0.77390218, "num_input_tokens_seen": 138834920, "step": 6450, "time_per_iteration": 2.524054765701294 }, { "auxiliary_loss_clip": 0.01149736, "auxiliary_loss_mlp": 0.01026742, "balance_loss_clip": 1.0473392, "balance_loss_mlp": 1.0192976, "epoch": 0.7756868875127758, "flos": 18693623007360.0, "grad_norm": 1.5923475685214625, "language_loss": 0.69798863, "learning_rate": 5.048475277121214e-07, "loss": 0.71975344, "num_input_tokens_seen": 138852135, "step": 6451, "time_per_iteration": 2.4632203578948975 }, { "auxiliary_loss_clip": 0.01152298, "auxiliary_loss_mlp": 0.01026285, "balance_loss_clip": 1.04740262, "balance_loss_mlp": 1.01887035, "epoch": 0.7758071304034149, "flos": 28404191543040.0, "grad_norm": 2.5577606357876497, "language_loss": 0.77165943, "learning_rate": 5.043302668519598e-07, "loss": 0.79344523, "num_input_tokens_seen": 138871470, "step": 6452, "time_per_iteration": 2.5650269985198975 }, { "auxiliary_loss_clip": 0.01152302, "auxiliary_loss_mlp": 0.01027062, "balance_loss_clip": 1.04604924, "balance_loss_mlp": 1.01990342, "epoch": 0.775927373294054, "flos": 20595344670720.0, "grad_norm": 1.9637669686093535, "language_loss": 0.72143769, "learning_rate": 5.038132328837079e-07, "loss": 0.7432313, "num_input_tokens_seen": 138889860, "step": 6453, "time_per_iteration": 2.4759116172790527 }, { "auxiliary_loss_clip": 0.01150506, "auxiliary_loss_mlp": 0.01022115, "balance_loss_clip": 1.04582405, "balance_loss_mlp": 1.01563597, "epoch": 0.7760476161846931, "flos": 22526368853760.0, "grad_norm": 1.9868925439199594, "language_loss": 0.736148, "learning_rate": 5.032964258857993e-07, "loss": 0.75787413, "num_input_tokens_seen": 138909955, "step": 6454, "time_per_iteration": 3.3098433017730713 }, { "auxiliary_loss_clip": 0.01148161, "auxiliary_loss_mlp": 0.01025139, "balance_loss_clip": 1.04271984, "balance_loss_mlp": 1.01811719, "epoch": 0.7761678590753321, "flos": 48651488403840.0, "grad_norm": 1.4834152017900126, "language_loss": 0.68058598, "learning_rate": 5.027798459366329e-07, "loss": 0.70231897, "num_input_tokens_seen": 138935320, "step": 6455, "time_per_iteration": 3.574043035507202 }, { "auxiliary_loss_clip": 0.01153371, "auxiliary_loss_mlp": 0.01026914, "balance_loss_clip": 1.04562211, "balance_loss_mlp": 1.0199194, "epoch": 0.7762881019659713, "flos": 26177047637760.0, "grad_norm": 1.5264554480203447, "language_loss": 0.63518417, "learning_rate": 5.02263493114573e-07, "loss": 0.65698701, "num_input_tokens_seen": 138957115, "step": 6456, "time_per_iteration": 2.5310778617858887 }, { "auxiliary_loss_clip": 0.01162072, "auxiliary_loss_mlp": 0.01021934, "balance_loss_clip": 1.04678488, "balance_loss_mlp": 1.01501691, "epoch": 0.7764083448566104, "flos": 20588341518720.0, "grad_norm": 2.2524415144245755, "language_loss": 0.77292264, "learning_rate": 5.017473674979502e-07, "loss": 0.79476267, "num_input_tokens_seen": 138973140, "step": 6457, "time_per_iteration": 3.2398018836975098 }, { "auxiliary_loss_clip": 0.01026852, "auxiliary_loss_mlp": 0.01003774, "balance_loss_clip": 1.01798153, "balance_loss_mlp": 1.00278425, "epoch": 0.7765285877472494, "flos": 67293078560640.0, "grad_norm": 0.7440096850543245, "language_loss": 0.58359283, "learning_rate": 5.01231469165061e-07, "loss": 0.60389912, "num_input_tokens_seen": 139028965, "step": 6458, "time_per_iteration": 3.0386314392089844 }, { "auxiliary_loss_clip": 0.01047829, "auxiliary_loss_mlp": 0.01000677, "balance_loss_clip": 1.00922894, "balance_loss_mlp": 0.99959797, "epoch": 0.7766488306378886, "flos": 61344476121600.0, "grad_norm": 0.8319243547413446, "language_loss": 0.56872571, "learning_rate": 5.007157981941663e-07, "loss": 0.58921081, "num_input_tokens_seen": 139094325, "step": 6459, "time_per_iteration": 3.192441701889038 }, { "auxiliary_loss_clip": 0.01037368, "auxiliary_loss_mlp": 0.01000069, "balance_loss_clip": 1.00914693, "balance_loss_mlp": 0.9989962, "epoch": 0.7767690735285276, "flos": 62946199393920.0, "grad_norm": 0.8774775426335493, "language_loss": 0.67527282, "learning_rate": 5.002003546634928e-07, "loss": 0.69564724, "num_input_tokens_seen": 139150425, "step": 6460, "time_per_iteration": 3.057231903076172 }, { "auxiliary_loss_clip": 0.01104617, "auxiliary_loss_mlp": 0.01030448, "balance_loss_clip": 1.04276097, "balance_loss_mlp": 1.02366793, "epoch": 0.7768893164191667, "flos": 20886400575360.0, "grad_norm": 1.700649900150202, "language_loss": 0.76038468, "learning_rate": 4.996851386512331e-07, "loss": 0.78173536, "num_input_tokens_seen": 139169130, "step": 6461, "time_per_iteration": 2.616244077682495 }, { "auxiliary_loss_clip": 0.01139873, "auxiliary_loss_mlp": 0.01026168, "balance_loss_clip": 1.04706526, "balance_loss_mlp": 1.01874769, "epoch": 0.7770095593098058, "flos": 20704584908160.0, "grad_norm": 2.3100777882831904, "language_loss": 0.83089393, "learning_rate": 4.991701502355444e-07, "loss": 0.85255432, "num_input_tokens_seen": 139189595, "step": 6462, "time_per_iteration": 2.5306806564331055 }, { "auxiliary_loss_clip": 0.01153385, "auxiliary_loss_mlp": 0.01021507, "balance_loss_clip": 1.04579258, "balance_loss_mlp": 1.01485848, "epoch": 0.7771298022004449, "flos": 24717709877760.0, "grad_norm": 1.4963032764702198, "language_loss": 0.75891149, "learning_rate": 4.986553894945518e-07, "loss": 0.78066039, "num_input_tokens_seen": 139210805, "step": 6463, "time_per_iteration": 2.51418399810791 }, { "auxiliary_loss_clip": 0.01106868, "auxiliary_loss_mlp": 0.01026468, "balance_loss_clip": 1.0425036, "balance_loss_mlp": 1.01943731, "epoch": 0.777250045091084, "flos": 25009232659200.0, "grad_norm": 1.8613418117524225, "language_loss": 0.8595984, "learning_rate": 4.981408565063416e-07, "loss": 0.88093174, "num_input_tokens_seen": 139230750, "step": 6464, "time_per_iteration": 2.625422954559326 }, { "auxiliary_loss_clip": 0.01166157, "auxiliary_loss_mlp": 0.01024454, "balance_loss_clip": 1.04825187, "balance_loss_mlp": 1.01739943, "epoch": 0.777370287981723, "flos": 20119887319680.0, "grad_norm": 1.9144009612590764, "language_loss": 0.7635411, "learning_rate": 4.976265513489701e-07, "loss": 0.78544724, "num_input_tokens_seen": 139250720, "step": 6465, "time_per_iteration": 2.472801923751831 }, { "auxiliary_loss_clip": 0.01150905, "auxiliary_loss_mlp": 0.01025317, "balance_loss_clip": 1.04550076, "balance_loss_mlp": 1.01811635, "epoch": 0.7774905308723622, "flos": 21718809331200.0, "grad_norm": 1.6857516502403274, "language_loss": 0.80172974, "learning_rate": 4.971124741004562e-07, "loss": 0.82349193, "num_input_tokens_seen": 139269720, "step": 6466, "time_per_iteration": 2.5214762687683105 }, { "auxiliary_loss_clip": 0.01148178, "auxiliary_loss_mlp": 0.01023648, "balance_loss_clip": 1.04461098, "balance_loss_mlp": 1.0169518, "epoch": 0.7776107737630013, "flos": 16034115093120.0, "grad_norm": 1.732008655719451, "language_loss": 0.76075304, "learning_rate": 4.965986248387846e-07, "loss": 0.7824713, "num_input_tokens_seen": 139288035, "step": 6467, "time_per_iteration": 2.4609971046447754 }, { "auxiliary_loss_clip": 0.01138162, "auxiliary_loss_mlp": 0.01025954, "balance_loss_clip": 1.04367948, "balance_loss_mlp": 1.0193584, "epoch": 0.7777310166536403, "flos": 24790895838720.0, "grad_norm": 1.7664616536605562, "language_loss": 0.76964509, "learning_rate": 4.960850036419073e-07, "loss": 0.79128623, "num_input_tokens_seen": 139307135, "step": 6468, "time_per_iteration": 2.551643133163452 }, { "auxiliary_loss_clip": 0.01132998, "auxiliary_loss_mlp": 0.01028477, "balance_loss_clip": 1.04431021, "balance_loss_mlp": 1.02121437, "epoch": 0.7778512595442795, "flos": 17272530253440.0, "grad_norm": 1.8255918376108202, "language_loss": 0.78555775, "learning_rate": 4.955716105877378e-07, "loss": 0.80717242, "num_input_tokens_seen": 139325905, "step": 6469, "time_per_iteration": 2.483099937438965 }, { "auxiliary_loss_clip": 0.01153621, "auxiliary_loss_mlp": 0.00760229, "balance_loss_clip": 1.04591572, "balance_loss_mlp": 1.00027823, "epoch": 0.7779715024349185, "flos": 17748418567680.0, "grad_norm": 1.6075458413453387, "language_loss": 0.82742143, "learning_rate": 4.950584457541598e-07, "loss": 0.84656, "num_input_tokens_seen": 139344370, "step": 6470, "time_per_iteration": 2.4863955974578857 }, { "auxiliary_loss_clip": 0.01152686, "auxiliary_loss_mlp": 0.01030816, "balance_loss_clip": 1.0468657, "balance_loss_mlp": 1.02428889, "epoch": 0.7780917453255576, "flos": 24316875031680.0, "grad_norm": 1.3959176170436078, "language_loss": 0.82025301, "learning_rate": 4.945455092190183e-07, "loss": 0.84208798, "num_input_tokens_seen": 139365625, "step": 6471, "time_per_iteration": 3.3291244506835938 }, { "auxiliary_loss_clip": 0.01056385, "auxiliary_loss_mlp": 0.01001529, "balance_loss_clip": 1.00851607, "balance_loss_mlp": 1.00045574, "epoch": 0.7782119882161967, "flos": 56364601530240.0, "grad_norm": 0.6769566640865966, "language_loss": 0.55970228, "learning_rate": 4.940328010601271e-07, "loss": 0.58028144, "num_input_tokens_seen": 139430540, "step": 6472, "time_per_iteration": 3.0740015506744385 }, { "auxiliary_loss_clip": 0.01145105, "auxiliary_loss_mlp": 0.01033912, "balance_loss_clip": 1.05157852, "balance_loss_mlp": 1.02642262, "epoch": 0.7783322311068358, "flos": 46789986994560.0, "grad_norm": 1.7784367275674553, "language_loss": 0.76671809, "learning_rate": 4.935203213552621e-07, "loss": 0.7885083, "num_input_tokens_seen": 139454280, "step": 6473, "time_per_iteration": 2.732941150665283 }, { "auxiliary_loss_clip": 0.01134981, "auxiliary_loss_mlp": 0.01024802, "balance_loss_clip": 1.04382741, "balance_loss_mlp": 1.01744699, "epoch": 0.7784524739974749, "flos": 19057864872960.0, "grad_norm": 2.0778088142655906, "language_loss": 0.66937625, "learning_rate": 4.930080701821662e-07, "loss": 0.69097406, "num_input_tokens_seen": 139471745, "step": 6474, "time_per_iteration": 2.5188586711883545 }, { "auxiliary_loss_clip": 0.01137662, "auxiliary_loss_mlp": 0.01027282, "balance_loss_clip": 1.04541707, "balance_loss_mlp": 1.01996541, "epoch": 0.778572716888114, "flos": 24791111320320.0, "grad_norm": 2.0099915191027966, "language_loss": 0.77219188, "learning_rate": 4.92496047618548e-07, "loss": 0.79384124, "num_input_tokens_seen": 139491505, "step": 6475, "time_per_iteration": 2.556781053543091 }, { "auxiliary_loss_clip": 0.01153962, "auxiliary_loss_mlp": 0.01030929, "balance_loss_clip": 1.04895759, "balance_loss_mlp": 1.02360916, "epoch": 0.7786929597787531, "flos": 20078086867200.0, "grad_norm": 1.8916079996294195, "language_loss": 0.77801442, "learning_rate": 4.919842537420811e-07, "loss": 0.79986334, "num_input_tokens_seen": 139508620, "step": 6476, "time_per_iteration": 2.5134615898132324 }, { "auxiliary_loss_clip": 0.01140752, "auxiliary_loss_mlp": 0.01032064, "balance_loss_clip": 1.04838943, "balance_loss_mlp": 1.02549255, "epoch": 0.7788132026693921, "flos": 21872220318720.0, "grad_norm": 1.8584801829312216, "language_loss": 0.78945428, "learning_rate": 4.91472688630404e-07, "loss": 0.8111825, "num_input_tokens_seen": 139529360, "step": 6477, "time_per_iteration": 2.5466794967651367 }, { "auxiliary_loss_clip": 0.01161121, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.04620206, "balance_loss_mlp": 1.02265167, "epoch": 0.7789334455600313, "flos": 11181937351680.0, "grad_norm": 1.5805027394379267, "language_loss": 0.73780751, "learning_rate": 4.909613523611202e-07, "loss": 0.7597115, "num_input_tokens_seen": 139546240, "step": 6478, "time_per_iteration": 2.469123601913452 }, { "auxiliary_loss_clip": 0.01099968, "auxiliary_loss_mlp": 0.0076102, "balance_loss_clip": 1.03942752, "balance_loss_mlp": 1.00025129, "epoch": 0.7790536884506704, "flos": 28695427015680.0, "grad_norm": 1.8155462318572995, "language_loss": 0.74473429, "learning_rate": 4.904502450117991e-07, "loss": 0.76334417, "num_input_tokens_seen": 139567200, "step": 6479, "time_per_iteration": 2.6692214012145996 }, { "auxiliary_loss_clip": 0.01131121, "auxiliary_loss_mlp": 0.01028179, "balance_loss_clip": 1.04614115, "balance_loss_mlp": 1.02145278, "epoch": 0.7791739313413094, "flos": 11072302064640.0, "grad_norm": 2.411159898019545, "language_loss": 0.72551942, "learning_rate": 4.899393666599762e-07, "loss": 0.74711239, "num_input_tokens_seen": 139583775, "step": 6480, "time_per_iteration": 3.3149571418762207 }, { "auxiliary_loss_clip": 0.01162186, "auxiliary_loss_mlp": 0.01024214, "balance_loss_clip": 1.0451926, "balance_loss_mlp": 1.0173502, "epoch": 0.7792941742319486, "flos": 14679276975360.0, "grad_norm": 1.9580919513513324, "language_loss": 0.72624886, "learning_rate": 4.894287173831506e-07, "loss": 0.74811286, "num_input_tokens_seen": 139599735, "step": 6481, "time_per_iteration": 3.145453691482544 }, { "auxiliary_loss_clip": 0.01135848, "auxiliary_loss_mlp": 0.01026356, "balance_loss_clip": 1.04243279, "balance_loss_mlp": 1.01921821, "epoch": 0.7794144171225876, "flos": 23258874908160.0, "grad_norm": 1.8932223203062266, "language_loss": 0.84375733, "learning_rate": 4.889182972587877e-07, "loss": 0.86537939, "num_input_tokens_seen": 139619030, "step": 6482, "time_per_iteration": 2.523501396179199 }, { "auxiliary_loss_clip": 0.01130948, "auxiliary_loss_mlp": 0.01027832, "balance_loss_clip": 1.04473662, "balance_loss_mlp": 1.02107906, "epoch": 0.7795346600132267, "flos": 21507080613120.0, "grad_norm": 9.710303848078075, "language_loss": 0.66234672, "learning_rate": 4.884081063643177e-07, "loss": 0.68393451, "num_input_tokens_seen": 139637690, "step": 6483, "time_per_iteration": 3.2897355556488037 }, { "auxiliary_loss_clip": 0.0103226, "auxiliary_loss_mlp": 0.00999927, "balance_loss_clip": 1.01141191, "balance_loss_mlp": 0.99902087, "epoch": 0.7796549029038659, "flos": 70052273694720.0, "grad_norm": 0.8621934649572839, "language_loss": 0.52566767, "learning_rate": 4.878981447771353e-07, "loss": 0.54598957, "num_input_tokens_seen": 139692070, "step": 6484, "time_per_iteration": 3.115163564682007 }, { "auxiliary_loss_clip": 0.01120947, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 1.04456258, "balance_loss_mlp": 1.02232361, "epoch": 0.7797751457945049, "flos": 23989405714560.0, "grad_norm": 1.6316992261830194, "language_loss": 0.72827148, "learning_rate": 4.873884125746035e-07, "loss": 0.74977648, "num_input_tokens_seen": 139713745, "step": 6485, "time_per_iteration": 2.581382989883423 }, { "auxiliary_loss_clip": 0.0113265, "auxiliary_loss_mlp": 0.01023939, "balance_loss_clip": 1.04523301, "balance_loss_mlp": 1.01695383, "epoch": 0.779895388685144, "flos": 22674751937280.0, "grad_norm": 2.126933165693386, "language_loss": 0.72381508, "learning_rate": 4.868789098340456e-07, "loss": 0.745381, "num_input_tokens_seen": 139731650, "step": 6486, "time_per_iteration": 2.523045778274536 }, { "auxiliary_loss_clip": 0.01124488, "auxiliary_loss_mlp": 0.01025102, "balance_loss_clip": 1.04337788, "balance_loss_mlp": 1.01789618, "epoch": 0.7800156315757831, "flos": 23768698596480.0, "grad_norm": 2.3194994017288977, "language_loss": 0.73251039, "learning_rate": 4.863696366327543e-07, "loss": 0.75400627, "num_input_tokens_seen": 139750820, "step": 6487, "time_per_iteration": 2.563617706298828 }, { "auxiliary_loss_clip": 0.01150123, "auxiliary_loss_mlp": 0.01027835, "balance_loss_clip": 1.04295731, "balance_loss_mlp": 1.0208106, "epoch": 0.7801358744664222, "flos": 26429714881920.0, "grad_norm": 1.7333609652875686, "language_loss": 0.77841908, "learning_rate": 4.85860593047986e-07, "loss": 0.80019861, "num_input_tokens_seen": 139770885, "step": 6488, "time_per_iteration": 2.5248007774353027 }, { "auxiliary_loss_clip": 0.01114353, "auxiliary_loss_mlp": 0.01027662, "balance_loss_clip": 1.03809643, "balance_loss_mlp": 1.02055681, "epoch": 0.7802561173570612, "flos": 26322162583680.0, "grad_norm": 1.4696677239899198, "language_loss": 0.74806821, "learning_rate": 4.853517791569613e-07, "loss": 0.76948833, "num_input_tokens_seen": 139793065, "step": 6489, "time_per_iteration": 2.5916953086853027 }, { "auxiliary_loss_clip": 0.01140195, "auxiliary_loss_mlp": 0.0076095, "balance_loss_clip": 1.04340172, "balance_loss_mlp": 1.00026381, "epoch": 0.7803763602477004, "flos": 40333751596800.0, "grad_norm": 1.92825875798555, "language_loss": 0.66068298, "learning_rate": 4.848431950368684e-07, "loss": 0.67969441, "num_input_tokens_seen": 139815625, "step": 6490, "time_per_iteration": 2.6829874515533447 }, { "auxiliary_loss_clip": 0.01056035, "auxiliary_loss_mlp": 0.00751187, "balance_loss_clip": 1.00844383, "balance_loss_mlp": 1.00003803, "epoch": 0.7804966031383395, "flos": 67001448038400.0, "grad_norm": 0.7139268136616196, "language_loss": 0.55773413, "learning_rate": 4.843348407648569e-07, "loss": 0.57580638, "num_input_tokens_seen": 139876905, "step": 6491, "time_per_iteration": 3.0175840854644775 }, { "auxiliary_loss_clip": 0.01149116, "auxiliary_loss_mlp": 0.01025248, "balance_loss_clip": 1.04084373, "balance_loss_mlp": 1.01769602, "epoch": 0.7806168460289785, "flos": 17740733057280.0, "grad_norm": 2.4361989538748676, "language_loss": 0.82596505, "learning_rate": 4.838267164180457e-07, "loss": 0.8477087, "num_input_tokens_seen": 139892575, "step": 6492, "time_per_iteration": 2.4603161811828613 }, { "auxiliary_loss_clip": 0.0116711, "auxiliary_loss_mlp": 0.01027178, "balance_loss_clip": 1.04793692, "balance_loss_mlp": 1.02002573, "epoch": 0.7807370889196176, "flos": 23946240545280.0, "grad_norm": 1.7451426401907058, "language_loss": 0.83712161, "learning_rate": 4.833188220735156e-07, "loss": 0.85906446, "num_input_tokens_seen": 139912245, "step": 6493, "time_per_iteration": 2.46107816696167 }, { "auxiliary_loss_clip": 0.01150253, "auxiliary_loss_mlp": 0.01021587, "balance_loss_clip": 1.0466907, "balance_loss_mlp": 1.01471806, "epoch": 0.7808573318102567, "flos": 18989024457600.0, "grad_norm": 2.0679413148494663, "language_loss": 0.74675047, "learning_rate": 4.828111578083152e-07, "loss": 0.76846886, "num_input_tokens_seen": 139929150, "step": 6494, "time_per_iteration": 2.495389461517334 }, { "auxiliary_loss_clip": 0.01135213, "auxiliary_loss_mlp": 0.01026048, "balance_loss_clip": 1.04497671, "balance_loss_mlp": 1.01893067, "epoch": 0.7809775747008958, "flos": 23980750536960.0, "grad_norm": 1.832012049869721, "language_loss": 0.81131995, "learning_rate": 4.823037236994556e-07, "loss": 0.83293253, "num_input_tokens_seen": 139947315, "step": 6495, "time_per_iteration": 2.5544991493225098 }, { "auxiliary_loss_clip": 0.01046751, "auxiliary_loss_mlp": 0.01000446, "balance_loss_clip": 1.00799012, "balance_loss_mlp": 0.99933761, "epoch": 0.7810978175915348, "flos": 68535875180160.0, "grad_norm": 0.7753786716000538, "language_loss": 0.56360388, "learning_rate": 4.817965198239136e-07, "loss": 0.58407581, "num_input_tokens_seen": 140013775, "step": 6496, "time_per_iteration": 3.0868732929229736 }, { "auxiliary_loss_clip": 0.01119831, "auxiliary_loss_mlp": 0.01027211, "balance_loss_clip": 1.04198766, "balance_loss_mlp": 1.02034771, "epoch": 0.781218060482174, "flos": 19642131498240.0, "grad_norm": 2.074702896653299, "language_loss": 0.74391162, "learning_rate": 4.812895462586331e-07, "loss": 0.76538205, "num_input_tokens_seen": 140031600, "step": 6497, "time_per_iteration": 3.2819712162017822 }, { "auxiliary_loss_clip": 0.01126745, "auxiliary_loss_mlp": 0.01027677, "balance_loss_clip": 1.04486251, "balance_loss_mlp": 1.0209682, "epoch": 0.7813383033728131, "flos": 25627865621760.0, "grad_norm": 1.608433596862151, "language_loss": 0.82066858, "learning_rate": 4.807828030805207e-07, "loss": 0.8422128, "num_input_tokens_seen": 140050590, "step": 6498, "time_per_iteration": 2.615826368331909 }, { "auxiliary_loss_clip": 0.01146696, "auxiliary_loss_mlp": 0.01028316, "balance_loss_clip": 1.04606175, "balance_loss_mlp": 1.02128553, "epoch": 0.7814585462634521, "flos": 20485924865280.0, "grad_norm": 1.8173695644406185, "language_loss": 0.67633998, "learning_rate": 4.802762903664495e-07, "loss": 0.69809008, "num_input_tokens_seen": 140069770, "step": 6499, "time_per_iteration": 2.4908089637756348 }, { "auxiliary_loss_clip": 0.01140653, "auxiliary_loss_mlp": 0.01027806, "balance_loss_clip": 1.04590511, "balance_loss_mlp": 1.02043843, "epoch": 0.7815787891540913, "flos": 22304297018880.0, "grad_norm": 9.070221819689818, "language_loss": 0.73735523, "learning_rate": 4.797700081932565e-07, "loss": 0.75903982, "num_input_tokens_seen": 140087635, "step": 6500, "time_per_iteration": 2.51689076423645 }, { "auxiliary_loss_clip": 0.0109077, "auxiliary_loss_mlp": 0.01030228, "balance_loss_clip": 1.03769064, "balance_loss_mlp": 1.02315879, "epoch": 0.7816990320447303, "flos": 22600668136320.0, "grad_norm": 2.342391542116195, "language_loss": 0.82162005, "learning_rate": 4.792639566377442e-07, "loss": 0.84283, "num_input_tokens_seen": 140105045, "step": 6501, "time_per_iteration": 2.6163601875305176 }, { "auxiliary_loss_clip": 0.01143956, "auxiliary_loss_mlp": 0.01027231, "balance_loss_clip": 1.04297376, "balance_loss_mlp": 1.02003407, "epoch": 0.7818192749353694, "flos": 24935974871040.0, "grad_norm": 1.5957907069921615, "language_loss": 0.77157086, "learning_rate": 4.78758135776681e-07, "loss": 0.79328275, "num_input_tokens_seen": 140124900, "step": 6502, "time_per_iteration": 2.5160863399505615 }, { "auxiliary_loss_clip": 0.01138158, "auxiliary_loss_mlp": 0.01029135, "balance_loss_clip": 1.04600811, "balance_loss_mlp": 1.02146113, "epoch": 0.7819395178260086, "flos": 23733039369600.0, "grad_norm": 1.9421097432815195, "language_loss": 0.78911799, "learning_rate": 4.782525456867989e-07, "loss": 0.8107909, "num_input_tokens_seen": 140143755, "step": 6503, "time_per_iteration": 2.5491530895233154 }, { "auxiliary_loss_clip": 0.01122356, "auxiliary_loss_mlp": 0.01023816, "balance_loss_clip": 1.04519081, "balance_loss_mlp": 1.01619208, "epoch": 0.7820597607166476, "flos": 23221671396480.0, "grad_norm": 1.5288338747309502, "language_loss": 0.83341682, "learning_rate": 4.777471864447959e-07, "loss": 0.85487854, "num_input_tokens_seen": 140164495, "step": 6504, "time_per_iteration": 2.5561225414276123 }, { "auxiliary_loss_clip": 0.01137227, "auxiliary_loss_mlp": 0.01024811, "balance_loss_clip": 1.04304075, "balance_loss_mlp": 1.01822782, "epoch": 0.7821800036072867, "flos": 22309540404480.0, "grad_norm": 2.02836879675236, "language_loss": 0.80618054, "learning_rate": 4.772420581273344e-07, "loss": 0.82780087, "num_input_tokens_seen": 140181980, "step": 6505, "time_per_iteration": 2.5175833702087402 }, { "auxiliary_loss_clip": 0.01146086, "auxiliary_loss_mlp": 0.01022365, "balance_loss_clip": 1.04630637, "balance_loss_mlp": 1.01572752, "epoch": 0.7823002464979258, "flos": 21544176384000.0, "grad_norm": 1.8524589084194576, "language_loss": 0.76136214, "learning_rate": 4.7673716081104134e-07, "loss": 0.78304666, "num_input_tokens_seen": 140202155, "step": 6506, "time_per_iteration": 3.2811100482940674 }, { "auxiliary_loss_clip": 0.01150063, "auxiliary_loss_mlp": 0.01029061, "balance_loss_clip": 1.04857183, "balance_loss_mlp": 1.02241552, "epoch": 0.7824204893885649, "flos": 24535642815360.0, "grad_norm": 1.68563867933467, "language_loss": 0.84693635, "learning_rate": 4.762324945725109e-07, "loss": 0.86872756, "num_input_tokens_seen": 140221600, "step": 6507, "time_per_iteration": 3.3656256198883057 }, { "auxiliary_loss_clip": 0.0113383, "auxiliary_loss_mlp": 0.01029417, "balance_loss_clip": 1.04780316, "balance_loss_mlp": 1.02289665, "epoch": 0.782540732279204, "flos": 27415211402880.0, "grad_norm": 1.7168648052556679, "language_loss": 0.75630844, "learning_rate": 4.7572805948829844e-07, "loss": 0.77794099, "num_input_tokens_seen": 140241860, "step": 6508, "time_per_iteration": 2.5579142570495605 }, { "auxiliary_loss_clip": 0.01115301, "auxiliary_loss_mlp": 0.01025269, "balance_loss_clip": 1.0436002, "balance_loss_mlp": 1.01873589, "epoch": 0.7826609751698431, "flos": 24353216616960.0, "grad_norm": 1.665977734085383, "language_loss": 0.71022904, "learning_rate": 4.7522385563492795e-07, "loss": 0.73163474, "num_input_tokens_seen": 140262160, "step": 6509, "time_per_iteration": 3.344266891479492 }, { "auxiliary_loss_clip": 0.01125531, "auxiliary_loss_mlp": 0.01036689, "balance_loss_clip": 1.04485643, "balance_loss_mlp": 1.02979887, "epoch": 0.7827812180604822, "flos": 23988543788160.0, "grad_norm": 1.7755482925981065, "language_loss": 0.70295185, "learning_rate": 4.747198830888863e-07, "loss": 0.72457403, "num_input_tokens_seen": 140282030, "step": 6510, "time_per_iteration": 2.5800187587738037 }, { "auxiliary_loss_clip": 0.0113382, "auxiliary_loss_mlp": 0.01026823, "balance_loss_clip": 1.04430735, "balance_loss_mlp": 1.01976025, "epoch": 0.7829014609511212, "flos": 27454318335360.0, "grad_norm": 1.9406361521144146, "language_loss": 0.68176603, "learning_rate": 4.742161419266251e-07, "loss": 0.70337248, "num_input_tokens_seen": 140301190, "step": 6511, "time_per_iteration": 2.5673305988311768 }, { "auxiliary_loss_clip": 0.01154505, "auxiliary_loss_mlp": 0.01031483, "balance_loss_clip": 1.04847836, "balance_loss_mlp": 1.02435446, "epoch": 0.7830217038417604, "flos": 29204532432000.0, "grad_norm": 2.643026042730957, "language_loss": 0.64946854, "learning_rate": 4.7371263222456304e-07, "loss": 0.67132843, "num_input_tokens_seen": 140318510, "step": 6512, "time_per_iteration": 2.5491061210632324 }, { "auxiliary_loss_clip": 0.0104402, "auxiliary_loss_mlp": 0.01002374, "balance_loss_clip": 1.00843453, "balance_loss_mlp": 1.00134313, "epoch": 0.7831419467323995, "flos": 60950895822720.0, "grad_norm": 0.7989762737519451, "language_loss": 0.61480826, "learning_rate": 4.7320935405908004e-07, "loss": 0.63527215, "num_input_tokens_seen": 140379380, "step": 6513, "time_per_iteration": 3.0544650554656982 }, { "auxiliary_loss_clip": 0.01166419, "auxiliary_loss_mlp": 0.01026991, "balance_loss_clip": 1.04766107, "balance_loss_mlp": 1.01904011, "epoch": 0.7832621896230385, "flos": 19682531320320.0, "grad_norm": 2.256425534944916, "language_loss": 0.84065819, "learning_rate": 4.7270630750652475e-07, "loss": 0.86259234, "num_input_tokens_seen": 140395335, "step": 6514, "time_per_iteration": 2.453723192214966 }, { "auxiliary_loss_clip": 0.01147882, "auxiliary_loss_mlp": 0.01022702, "balance_loss_clip": 1.04525208, "balance_loss_mlp": 1.01590443, "epoch": 0.7833824325136777, "flos": 25009232659200.0, "grad_norm": 1.648343507656554, "language_loss": 0.80012298, "learning_rate": 4.7220349264320746e-07, "loss": 0.82182884, "num_input_tokens_seen": 140414420, "step": 6515, "time_per_iteration": 2.5175106525421143 }, { "auxiliary_loss_clip": 0.01045229, "auxiliary_loss_mlp": 0.01003005, "balance_loss_clip": 1.00824189, "balance_loss_mlp": 1.00199175, "epoch": 0.7835026754043167, "flos": 68800142517120.0, "grad_norm": 0.7359818475499356, "language_loss": 0.54951131, "learning_rate": 4.71700909545407e-07, "loss": 0.56999362, "num_input_tokens_seen": 140477365, "step": 6516, "time_per_iteration": 3.1136608123779297 }, { "auxiliary_loss_clip": 0.01150534, "auxiliary_loss_mlp": 0.01025974, "balance_loss_clip": 1.04532814, "balance_loss_mlp": 1.01911056, "epoch": 0.7836229182949558, "flos": 19864598382720.0, "grad_norm": 3.3863844594218278, "language_loss": 0.7700336, "learning_rate": 4.711985582893627e-07, "loss": 0.79179859, "num_input_tokens_seen": 140495885, "step": 6517, "time_per_iteration": 2.477668285369873 }, { "auxiliary_loss_clip": 0.0110966, "auxiliary_loss_mlp": 0.0102479, "balance_loss_clip": 1.03935885, "balance_loss_mlp": 1.01767015, "epoch": 0.783743161185595, "flos": 22965843755520.0, "grad_norm": 1.8056698794777117, "language_loss": 0.72069395, "learning_rate": 4.706964389512811e-07, "loss": 0.74203843, "num_input_tokens_seen": 140515920, "step": 6518, "time_per_iteration": 2.6027333736419678 }, { "auxiliary_loss_clip": 0.01163318, "auxiliary_loss_mlp": 0.01030735, "balance_loss_clip": 1.04851174, "balance_loss_mlp": 1.02392817, "epoch": 0.783863404076234, "flos": 12458489777280.0, "grad_norm": 1.740197051847246, "language_loss": 0.87240738, "learning_rate": 4.701945516073345e-07, "loss": 0.89434791, "num_input_tokens_seen": 140533395, "step": 6519, "time_per_iteration": 2.4272100925445557 }, { "auxiliary_loss_clip": 0.01122233, "auxiliary_loss_mlp": 0.01025345, "balance_loss_clip": 1.04602253, "balance_loss_mlp": 1.01861262, "epoch": 0.7839836469668731, "flos": 24243940465920.0, "grad_norm": 1.8068563700734452, "language_loss": 0.75192785, "learning_rate": 4.696928963336577e-07, "loss": 0.77340364, "num_input_tokens_seen": 140552825, "step": 6520, "time_per_iteration": 2.595992088317871 }, { "auxiliary_loss_clip": 0.01043598, "auxiliary_loss_mlp": 0.01001169, "balance_loss_clip": 1.00823975, "balance_loss_mlp": 1.00016117, "epoch": 0.7841038898575122, "flos": 62121978938880.0, "grad_norm": 0.9335578673965258, "language_loss": 0.61044312, "learning_rate": 4.6919147320635224e-07, "loss": 0.63089079, "num_input_tokens_seen": 140615535, "step": 6521, "time_per_iteration": 3.054169178009033 }, { "auxiliary_loss_clip": 0.0115192, "auxiliary_loss_mlp": 0.01028701, "balance_loss_clip": 1.04476976, "balance_loss_mlp": 1.02207017, "epoch": 0.7842241327481513, "flos": 20193899293440.0, "grad_norm": 2.094490430636886, "language_loss": 0.7321499, "learning_rate": 4.6869028230148286e-07, "loss": 0.75395614, "num_input_tokens_seen": 140633330, "step": 6522, "time_per_iteration": 2.4916365146636963 }, { "auxiliary_loss_clip": 0.01116727, "auxiliary_loss_mlp": 0.01029236, "balance_loss_clip": 1.04089975, "balance_loss_mlp": 1.02218485, "epoch": 0.7843443756387903, "flos": 28074531496320.0, "grad_norm": 2.8972258883940922, "language_loss": 0.60075814, "learning_rate": 4.6818932369507957e-07, "loss": 0.62221777, "num_input_tokens_seen": 140652830, "step": 6523, "time_per_iteration": 3.525413990020752 }, { "auxiliary_loss_clip": 0.01149054, "auxiliary_loss_mlp": 0.01027395, "balance_loss_clip": 1.0466485, "balance_loss_mlp": 1.02007818, "epoch": 0.7844646185294295, "flos": 21323397438720.0, "grad_norm": 1.9103103071980514, "language_loss": 0.88785923, "learning_rate": 4.676885974631386e-07, "loss": 0.90962374, "num_input_tokens_seen": 140671190, "step": 6524, "time_per_iteration": 2.4865715503692627 }, { "auxiliary_loss_clip": 0.01152563, "auxiliary_loss_mlp": 0.01022132, "balance_loss_clip": 1.04787815, "balance_loss_mlp": 1.01439524, "epoch": 0.7845848614200686, "flos": 23656585271040.0, "grad_norm": 1.978844594948399, "language_loss": 0.81014836, "learning_rate": 4.67188103681619e-07, "loss": 0.83189529, "num_input_tokens_seen": 140690975, "step": 6525, "time_per_iteration": 2.5077309608459473 }, { "auxiliary_loss_clip": 0.01144333, "auxiliary_loss_mlp": 0.00760462, "balance_loss_clip": 1.04652739, "balance_loss_mlp": 1.00025547, "epoch": 0.7847051043107076, "flos": 23402194174080.0, "grad_norm": 2.185276750032085, "language_loss": 0.68891984, "learning_rate": 4.666878424264453e-07, "loss": 0.70796782, "num_input_tokens_seen": 140710930, "step": 6526, "time_per_iteration": 2.4974687099456787 }, { "auxiliary_loss_clip": 0.01125917, "auxiliary_loss_mlp": 0.01023975, "balance_loss_clip": 1.04238415, "balance_loss_mlp": 1.01735306, "epoch": 0.7848253472013467, "flos": 19022277473280.0, "grad_norm": 1.5982554999067324, "language_loss": 0.73814362, "learning_rate": 4.661878137735069e-07, "loss": 0.75964254, "num_input_tokens_seen": 140729120, "step": 6527, "time_per_iteration": 2.513636589050293 }, { "auxiliary_loss_clip": 0.01137448, "auxiliary_loss_mlp": 0.01026137, "balance_loss_clip": 1.04696262, "balance_loss_mlp": 1.01919007, "epoch": 0.7849455900919858, "flos": 21179180332800.0, "grad_norm": 2.0820400886071733, "language_loss": 0.75428712, "learning_rate": 4.656880177986571e-07, "loss": 0.77592289, "num_input_tokens_seen": 140747665, "step": 6528, "time_per_iteration": 2.523752212524414 }, { "auxiliary_loss_clip": 0.01139679, "auxiliary_loss_mlp": 0.01030509, "balance_loss_clip": 1.04399037, "balance_loss_mlp": 1.02271914, "epoch": 0.7850658329826249, "flos": 19536482620800.0, "grad_norm": 1.9234823988459748, "language_loss": 0.81236011, "learning_rate": 4.6518845457771607e-07, "loss": 0.83406198, "num_input_tokens_seen": 140766525, "step": 6529, "time_per_iteration": 2.5233147144317627 }, { "auxiliary_loss_clip": 0.0114149, "auxiliary_loss_mlp": 0.00760804, "balance_loss_clip": 1.04370761, "balance_loss_mlp": 1.00022781, "epoch": 0.7851860758732639, "flos": 12495334152960.0, "grad_norm": 1.7286957845191866, "language_loss": 0.79030704, "learning_rate": 4.646891241864652e-07, "loss": 0.80932999, "num_input_tokens_seen": 140785090, "step": 6530, "time_per_iteration": 2.4663784503936768 }, { "auxiliary_loss_clip": 0.01147091, "auxiliary_loss_mlp": 0.010294, "balance_loss_clip": 1.04395819, "balance_loss_mlp": 1.02183616, "epoch": 0.7853063187639031, "flos": 22960959505920.0, "grad_norm": 1.8159386985430017, "language_loss": 0.73232996, "learning_rate": 4.6419002670065397e-07, "loss": 0.75409484, "num_input_tokens_seen": 140804670, "step": 6531, "time_per_iteration": 2.4960975646972656 }, { "auxiliary_loss_clip": 0.01129495, "auxiliary_loss_mlp": 0.010277, "balance_loss_clip": 1.04682517, "balance_loss_mlp": 1.01969266, "epoch": 0.7854265616545422, "flos": 17347260499200.0, "grad_norm": 1.9496859553763313, "language_loss": 0.86550546, "learning_rate": 4.6369116219599445e-07, "loss": 0.88707733, "num_input_tokens_seen": 140820655, "step": 6532, "time_per_iteration": 3.318070888519287 }, { "auxiliary_loss_clip": 0.01121618, "auxiliary_loss_mlp": 0.01024942, "balance_loss_clip": 1.04257298, "balance_loss_mlp": 1.01805151, "epoch": 0.7855468045451812, "flos": 23838293197440.0, "grad_norm": 1.573388239489022, "language_loss": 0.79023468, "learning_rate": 4.631925307481637e-07, "loss": 0.81170028, "num_input_tokens_seen": 140840470, "step": 6533, "time_per_iteration": 2.587639808654785 }, { "auxiliary_loss_clip": 0.01137508, "auxiliary_loss_mlp": 0.01023173, "balance_loss_clip": 1.04732299, "balance_loss_mlp": 1.01652133, "epoch": 0.7856670474358204, "flos": 25666792986240.0, "grad_norm": 2.6418177358372152, "language_loss": 0.75856054, "learning_rate": 4.6269413243280533e-07, "loss": 0.78016734, "num_input_tokens_seen": 140859890, "step": 6534, "time_per_iteration": 3.357423782348633 }, { "auxiliary_loss_clip": 0.01140069, "auxiliary_loss_mlp": 0.01029258, "balance_loss_clip": 1.04670608, "balance_loss_mlp": 1.02194428, "epoch": 0.7857872903264594, "flos": 18144656472960.0, "grad_norm": 3.0411660101213913, "language_loss": 0.74391806, "learning_rate": 4.621959673255236e-07, "loss": 0.76561129, "num_input_tokens_seen": 140876190, "step": 6535, "time_per_iteration": 3.234729290008545 }, { "auxiliary_loss_clip": 0.01105967, "auxiliary_loss_mlp": 0.01028056, "balance_loss_clip": 1.04244089, "balance_loss_mlp": 1.02121031, "epoch": 0.7859075332170985, "flos": 14386138081920.0, "grad_norm": 3.5425540028963582, "language_loss": 0.90295386, "learning_rate": 4.6169803550189135e-07, "loss": 0.92429411, "num_input_tokens_seen": 140891885, "step": 6536, "time_per_iteration": 2.5640954971313477 }, { "auxiliary_loss_clip": 0.0110041, "auxiliary_loss_mlp": 0.01030551, "balance_loss_clip": 1.04259932, "balance_loss_mlp": 1.02293992, "epoch": 0.7860277761077377, "flos": 19864059678720.0, "grad_norm": 2.0251254314030955, "language_loss": 0.77240431, "learning_rate": 4.6120033703744355e-07, "loss": 0.79371393, "num_input_tokens_seen": 140910780, "step": 6537, "time_per_iteration": 2.567315101623535 }, { "auxiliary_loss_clip": 0.01126092, "auxiliary_loss_mlp": 0.01029239, "balance_loss_clip": 1.04243743, "balance_loss_mlp": 1.02270365, "epoch": 0.7861480189983767, "flos": 26396174557440.0, "grad_norm": 1.9274258473459358, "language_loss": 0.78228951, "learning_rate": 4.607028720076822e-07, "loss": 0.80384278, "num_input_tokens_seen": 140927460, "step": 6538, "time_per_iteration": 2.5812530517578125 }, { "auxiliary_loss_clip": 0.01149466, "auxiliary_loss_mlp": 0.01021332, "balance_loss_clip": 1.0464673, "balance_loss_mlp": 1.01427197, "epoch": 0.7862682618890158, "flos": 24236578177920.0, "grad_norm": 2.0833905949592673, "language_loss": 0.73407787, "learning_rate": 4.6020564048807074e-07, "loss": 0.75578582, "num_input_tokens_seen": 140945135, "step": 6539, "time_per_iteration": 2.532745599746704 }, { "auxiliary_loss_clip": 0.0115324, "auxiliary_loss_mlp": 0.01027105, "balance_loss_clip": 1.04783583, "balance_loss_mlp": 1.01963389, "epoch": 0.7863885047796549, "flos": 47551508259840.0, "grad_norm": 1.8784879344032177, "language_loss": 0.72231752, "learning_rate": 4.5970864255403883e-07, "loss": 0.74412096, "num_input_tokens_seen": 140966660, "step": 6540, "time_per_iteration": 2.747436046600342 }, { "auxiliary_loss_clip": 0.01140596, "auxiliary_loss_mlp": 0.01028799, "balance_loss_clip": 1.04464507, "balance_loss_mlp": 1.02213526, "epoch": 0.786508747670294, "flos": 24389234979840.0, "grad_norm": 1.758276010690744, "language_loss": 0.81901526, "learning_rate": 4.59211878280982e-07, "loss": 0.84070915, "num_input_tokens_seen": 140986175, "step": 6541, "time_per_iteration": 2.5154309272766113 }, { "auxiliary_loss_clip": 0.01138051, "auxiliary_loss_mlp": 0.01029672, "balance_loss_clip": 1.04500759, "balance_loss_mlp": 1.02257061, "epoch": 0.786628990560933, "flos": 18041234238720.0, "grad_norm": 2.698983340937698, "language_loss": 0.70154226, "learning_rate": 4.587153477442578e-07, "loss": 0.72321951, "num_input_tokens_seen": 141002490, "step": 6542, "time_per_iteration": 2.548576831817627 }, { "auxiliary_loss_clip": 0.01169555, "auxiliary_loss_mlp": 0.01028119, "balance_loss_clip": 1.05047679, "balance_loss_mlp": 1.02082324, "epoch": 0.7867492334515722, "flos": 25848860048640.0, "grad_norm": 2.7516088189161865, "language_loss": 0.81445944, "learning_rate": 4.582190510191899e-07, "loss": 0.83643615, "num_input_tokens_seen": 141021150, "step": 6543, "time_per_iteration": 2.5127110481262207 }, { "auxiliary_loss_clip": 0.01121064, "auxiliary_loss_mlp": 0.01028139, "balance_loss_clip": 1.044873, "balance_loss_mlp": 1.0215168, "epoch": 0.7868694763422113, "flos": 16580819070720.0, "grad_norm": 1.903996707130796, "language_loss": 0.86997771, "learning_rate": 4.5772298818106625e-07, "loss": 0.89146972, "num_input_tokens_seen": 141036940, "step": 6544, "time_per_iteration": 2.5808732509613037 }, { "auxiliary_loss_clip": 0.01125687, "auxiliary_loss_mlp": 0.01028039, "balance_loss_clip": 1.04611373, "balance_loss_mlp": 1.02117276, "epoch": 0.7869897192328503, "flos": 29386276272000.0, "grad_norm": 2.2335611829875086, "language_loss": 0.71969426, "learning_rate": 4.572271593051384e-07, "loss": 0.7412315, "num_input_tokens_seen": 141054295, "step": 6545, "time_per_iteration": 2.619264841079712 }, { "auxiliary_loss_clip": 0.01102593, "auxiliary_loss_mlp": 0.01025183, "balance_loss_clip": 1.04297161, "balance_loss_mlp": 1.01833165, "epoch": 0.7871099621234895, "flos": 17128923678720.0, "grad_norm": 1.677518204750608, "language_loss": 0.77856576, "learning_rate": 4.567315644666245e-07, "loss": 0.79984355, "num_input_tokens_seen": 141073090, "step": 6546, "time_per_iteration": 2.5928032398223877 }, { "auxiliary_loss_clip": 0.01118346, "auxiliary_loss_mlp": 0.01025194, "balance_loss_clip": 1.04551029, "balance_loss_mlp": 1.0188911, "epoch": 0.7872302050141285, "flos": 23440187784960.0, "grad_norm": 2.1818669333368708, "language_loss": 0.84390366, "learning_rate": 4.5623620374070507e-07, "loss": 0.86533904, "num_input_tokens_seen": 141092405, "step": 6547, "time_per_iteration": 2.6027333736419678 }, { "auxiliary_loss_clip": 0.01025392, "auxiliary_loss_mlp": 0.01001416, "balance_loss_clip": 1.00803733, "balance_loss_mlp": 1.00024784, "epoch": 0.7873504479047676, "flos": 65959752689280.0, "grad_norm": 0.760864912295606, "language_loss": 0.58434469, "learning_rate": 4.557410772025263e-07, "loss": 0.60461277, "num_input_tokens_seen": 141154355, "step": 6548, "time_per_iteration": 4.089269638061523 }, { "auxiliary_loss_clip": 0.01133769, "auxiliary_loss_mlp": 0.0102461, "balance_loss_clip": 1.0441153, "balance_loss_mlp": 1.01791906, "epoch": 0.7874706907954068, "flos": 23258336204160.0, "grad_norm": 1.7212458642125015, "language_loss": 0.66086709, "learning_rate": 4.5524618492719803e-07, "loss": 0.68245089, "num_input_tokens_seen": 141173575, "step": 6549, "time_per_iteration": 2.5578396320343018 }, { "auxiliary_loss_clip": 0.01151476, "auxiliary_loss_mlp": 0.01020555, "balance_loss_clip": 1.04602611, "balance_loss_mlp": 1.0138526, "epoch": 0.7875909336860458, "flos": 28767786963840.0, "grad_norm": 1.6066700830234217, "language_loss": 0.78869569, "learning_rate": 4.54751526989795e-07, "loss": 0.81041598, "num_input_tokens_seen": 141195415, "step": 6550, "time_per_iteration": 2.578993082046509 }, { "auxiliary_loss_clip": 0.01153138, "auxiliary_loss_mlp": 0.01025565, "balance_loss_clip": 1.04659474, "balance_loss_mlp": 1.01871371, "epoch": 0.7877111765766849, "flos": 18697286194560.0, "grad_norm": 1.9014530708026893, "language_loss": 0.79132909, "learning_rate": 4.5425710346535775e-07, "loss": 0.81311613, "num_input_tokens_seen": 141213360, "step": 6551, "time_per_iteration": 2.4730305671691895 }, { "auxiliary_loss_clip": 0.01153076, "auxiliary_loss_mlp": 0.01024037, "balance_loss_clip": 1.04679251, "balance_loss_mlp": 1.0171293, "epoch": 0.787831419467324, "flos": 27592968833280.0, "grad_norm": 2.118484432448108, "language_loss": 0.81577694, "learning_rate": 4.537629144288877e-07, "loss": 0.83754808, "num_input_tokens_seen": 141230815, "step": 6552, "time_per_iteration": 2.5583908557891846 }, { "auxiliary_loss_clip": 0.01110483, "auxiliary_loss_mlp": 0.01028897, "balance_loss_clip": 1.03993082, "balance_loss_mlp": 1.02196765, "epoch": 0.7879516623579631, "flos": 18150187167360.0, "grad_norm": 1.8989647464209403, "language_loss": 0.74988788, "learning_rate": 4.5326895995535477e-07, "loss": 0.77128166, "num_input_tokens_seen": 141249715, "step": 6553, "time_per_iteration": 2.61181640625 }, { "auxiliary_loss_clip": 0.01148921, "auxiliary_loss_mlp": 0.01026616, "balance_loss_clip": 1.0456171, "balance_loss_mlp": 1.01975572, "epoch": 0.7880719052486022, "flos": 20339193807360.0, "grad_norm": 2.323660118224841, "language_loss": 0.84239048, "learning_rate": 4.527752401196907e-07, "loss": 0.86414587, "num_input_tokens_seen": 141267730, "step": 6554, "time_per_iteration": 2.5769009590148926 }, { "auxiliary_loss_clip": 0.01130983, "auxiliary_loss_mlp": 0.01025734, "balance_loss_clip": 1.04427826, "balance_loss_mlp": 1.01846278, "epoch": 0.7881921481392413, "flos": 21653237053440.0, "grad_norm": 2.2979060673229177, "language_loss": 0.66559875, "learning_rate": 4.5228175499679254e-07, "loss": 0.68716598, "num_input_tokens_seen": 141287315, "step": 6555, "time_per_iteration": 2.5301008224487305 }, { "auxiliary_loss_clip": 0.0104637, "auxiliary_loss_mlp": 0.01002452, "balance_loss_clip": 1.00863194, "balance_loss_mlp": 1.00132525, "epoch": 0.7883123910298804, "flos": 68565860058240.0, "grad_norm": 0.8304992459385439, "language_loss": 0.54563928, "learning_rate": 4.5178850466152174e-07, "loss": 0.56612748, "num_input_tokens_seen": 141346145, "step": 6556, "time_per_iteration": 3.185530662536621 }, { "auxiliary_loss_clip": 0.01129949, "auxiliary_loss_mlp": 0.01024084, "balance_loss_clip": 1.04131067, "balance_loss_mlp": 1.01744676, "epoch": 0.7884326339205194, "flos": 19318217627520.0, "grad_norm": 1.9204364126700941, "language_loss": 0.8186959, "learning_rate": 4.512954891887031e-07, "loss": 0.84023625, "num_input_tokens_seen": 141364445, "step": 6557, "time_per_iteration": 2.5286951065063477 }, { "auxiliary_loss_clip": 0.01127546, "auxiliary_loss_mlp": 0.01028037, "balance_loss_clip": 1.043082, "balance_loss_mlp": 1.02078021, "epoch": 0.7885528768111585, "flos": 17784903807360.0, "grad_norm": 1.971820357221249, "language_loss": 0.83584082, "learning_rate": 4.5080270865312806e-07, "loss": 0.85739666, "num_input_tokens_seen": 141381640, "step": 6558, "time_per_iteration": 2.5132339000701904 }, { "auxiliary_loss_clip": 0.01149324, "auxiliary_loss_mlp": 0.01022626, "balance_loss_clip": 1.04500246, "balance_loss_mlp": 1.01602471, "epoch": 0.7886731197017977, "flos": 18807639753600.0, "grad_norm": 1.915480435031715, "language_loss": 0.70946866, "learning_rate": 4.5031016312954985e-07, "loss": 0.73118818, "num_input_tokens_seen": 141399955, "step": 6559, "time_per_iteration": 3.246853828430176 }, { "auxiliary_loss_clip": 0.0115966, "auxiliary_loss_mlp": 0.01031793, "balance_loss_clip": 1.05003786, "balance_loss_mlp": 1.02417588, "epoch": 0.7887933625924367, "flos": 33365358126720.0, "grad_norm": 1.8208343131140012, "language_loss": 0.74703884, "learning_rate": 4.498178526926886e-07, "loss": 0.76895332, "num_input_tokens_seen": 141420820, "step": 6560, "time_per_iteration": 4.182219505310059 }, { "auxiliary_loss_clip": 0.0116473, "auxiliary_loss_mlp": 0.01028724, "balance_loss_clip": 1.04904723, "balance_loss_mlp": 1.02232265, "epoch": 0.7889136054830758, "flos": 17019360218880.0, "grad_norm": 2.013164464309595, "language_loss": 0.7244612, "learning_rate": 4.4932577741722635e-07, "loss": 0.74639571, "num_input_tokens_seen": 141439350, "step": 6561, "time_per_iteration": 2.4558377265930176 }, { "auxiliary_loss_clip": 0.0113276, "auxiliary_loss_mlp": 0.01027071, "balance_loss_clip": 1.04368091, "balance_loss_mlp": 1.01996577, "epoch": 0.7890338483737149, "flos": 29424629018880.0, "grad_norm": 1.9177824332027407, "language_loss": 0.74020386, "learning_rate": 4.4883393737780985e-07, "loss": 0.76180214, "num_input_tokens_seen": 141460300, "step": 6562, "time_per_iteration": 2.591503381729126 }, { "auxiliary_loss_clip": 0.01143647, "auxiliary_loss_mlp": 0.01024116, "balance_loss_clip": 1.04405928, "balance_loss_mlp": 1.01748478, "epoch": 0.789154091264354, "flos": 19971576063360.0, "grad_norm": 2.0470126435630758, "language_loss": 0.78473544, "learning_rate": 4.4834233264905254e-07, "loss": 0.80641305, "num_input_tokens_seen": 141477315, "step": 6563, "time_per_iteration": 2.4825618267059326 }, { "auxiliary_loss_clip": 0.01115775, "auxiliary_loss_mlp": 0.01028815, "balance_loss_clip": 1.04064918, "balance_loss_mlp": 1.02101254, "epoch": 0.789274334154993, "flos": 14537825216640.0, "grad_norm": 2.2413293605828377, "language_loss": 0.71740973, "learning_rate": 4.478509633055294e-07, "loss": 0.7388556, "num_input_tokens_seen": 141495025, "step": 6564, "time_per_iteration": 2.5364372730255127 }, { "auxiliary_loss_clip": 0.01140433, "auxiliary_loss_mlp": 0.01024015, "balance_loss_clip": 1.04634404, "balance_loss_mlp": 1.01606417, "epoch": 0.7893945770456322, "flos": 21827403123840.0, "grad_norm": 2.377196273341085, "language_loss": 0.80166364, "learning_rate": 4.473598294217813e-07, "loss": 0.82330811, "num_input_tokens_seen": 141510450, "step": 6565, "time_per_iteration": 2.524785041809082 }, { "auxiliary_loss_clip": 0.01149623, "auxiliary_loss_mlp": 0.01027304, "balance_loss_clip": 1.04732227, "balance_loss_mlp": 1.02039027, "epoch": 0.7895148199362713, "flos": 20740639184640.0, "grad_norm": 2.221040590571018, "language_loss": 0.71823525, "learning_rate": 4.468689310723124e-07, "loss": 0.74000454, "num_input_tokens_seen": 141528265, "step": 6566, "time_per_iteration": 2.5010581016540527 }, { "auxiliary_loss_clip": 0.01126227, "auxiliary_loss_mlp": 0.01030386, "balance_loss_clip": 1.04329622, "balance_loss_mlp": 1.023314, "epoch": 0.7896350628269103, "flos": 16690669839360.0, "grad_norm": 1.6673561926721985, "language_loss": 0.78807998, "learning_rate": 4.463782683315913e-07, "loss": 0.80964607, "num_input_tokens_seen": 141547270, "step": 6567, "time_per_iteration": 2.554352283477783 }, { "auxiliary_loss_clip": 0.01162033, "auxiliary_loss_mlp": 0.01025166, "balance_loss_clip": 1.04737687, "balance_loss_mlp": 1.01849651, "epoch": 0.7897553057175495, "flos": 22638374438400.0, "grad_norm": 1.793289201887468, "language_loss": 0.73334587, "learning_rate": 4.458878412740523e-07, "loss": 0.75521785, "num_input_tokens_seen": 141566050, "step": 6568, "time_per_iteration": 2.482306957244873 }, { "auxiliary_loss_clip": 0.01145805, "auxiliary_loss_mlp": 0.01024995, "balance_loss_clip": 1.04653108, "balance_loss_mlp": 1.01815283, "epoch": 0.7898755486081885, "flos": 14537573821440.0, "grad_norm": 2.4210298186030292, "language_loss": 0.77454484, "learning_rate": 4.453976499740919e-07, "loss": 0.79625279, "num_input_tokens_seen": 141583695, "step": 6569, "time_per_iteration": 2.470735549926758 }, { "auxiliary_loss_clip": 0.01148835, "auxiliary_loss_mlp": 0.01025658, "balance_loss_clip": 1.04876828, "balance_loss_mlp": 1.01876128, "epoch": 0.7899957914988276, "flos": 17238487138560.0, "grad_norm": 2.005765110692006, "language_loss": 0.77781439, "learning_rate": 4.4490769450607215e-07, "loss": 0.79955935, "num_input_tokens_seen": 141601320, "step": 6570, "time_per_iteration": 2.47442626953125 }, { "auxiliary_loss_clip": 0.01118526, "auxiliary_loss_mlp": 0.0102595, "balance_loss_clip": 1.03976846, "balance_loss_mlp": 1.01885104, "epoch": 0.7901160343894668, "flos": 41279351086080.0, "grad_norm": 1.824234249984581, "language_loss": 0.72494495, "learning_rate": 4.4441797494431845e-07, "loss": 0.74638963, "num_input_tokens_seen": 141623125, "step": 6571, "time_per_iteration": 2.741027355194092 }, { "auxiliary_loss_clip": 0.01150599, "auxiliary_loss_mlp": 0.01033876, "balance_loss_clip": 1.0481739, "balance_loss_mlp": 1.02648211, "epoch": 0.7902362772801058, "flos": 16837005847680.0, "grad_norm": 4.696847151844976, "language_loss": 0.77913916, "learning_rate": 4.439284913631207e-07, "loss": 0.80098391, "num_input_tokens_seen": 141640335, "step": 6572, "time_per_iteration": 2.6032888889312744 }, { "auxiliary_loss_clip": 0.0112317, "auxiliary_loss_mlp": 0.01028905, "balance_loss_clip": 1.04580104, "balance_loss_mlp": 1.02128744, "epoch": 0.7903565201707449, "flos": 27125987091840.0, "grad_norm": 1.8216273393638742, "language_loss": 0.8386941, "learning_rate": 4.434392438367347e-07, "loss": 0.86021483, "num_input_tokens_seen": 141659760, "step": 6573, "time_per_iteration": 2.601834297180176 }, { "auxiliary_loss_clip": 0.01150152, "auxiliary_loss_mlp": 0.01027108, "balance_loss_clip": 1.04529345, "balance_loss_mlp": 1.02045596, "epoch": 0.790476763061384, "flos": 31025167142400.0, "grad_norm": 1.8248478663365675, "language_loss": 0.73779821, "learning_rate": 4.4295023243937677e-07, "loss": 0.75957084, "num_input_tokens_seen": 141679965, "step": 6574, "time_per_iteration": 2.563201665878296 }, { "auxiliary_loss_clip": 0.01149883, "auxiliary_loss_mlp": 0.01026429, "balance_loss_clip": 1.04789782, "balance_loss_mlp": 1.01863241, "epoch": 0.7905970059520231, "flos": 22089084681600.0, "grad_norm": 1.6342362322240132, "language_loss": 0.80083525, "learning_rate": 4.4246145724523123e-07, "loss": 0.82259834, "num_input_tokens_seen": 141697710, "step": 6575, "time_per_iteration": 3.3016839027404785 }, { "auxiliary_loss_clip": 0.01122774, "auxiliary_loss_mlp": 0.01024249, "balance_loss_clip": 1.04568315, "balance_loss_mlp": 1.01694143, "epoch": 0.7907172488426621, "flos": 20558141159040.0, "grad_norm": 2.133116550740585, "language_loss": 0.77217096, "learning_rate": 4.41972918328444e-07, "loss": 0.79364121, "num_input_tokens_seen": 141715145, "step": 6576, "time_per_iteration": 2.5421082973480225 }, { "auxiliary_loss_clip": 0.01151223, "auxiliary_loss_mlp": 0.01034327, "balance_loss_clip": 1.04860938, "balance_loss_mlp": 1.02734733, "epoch": 0.7908374917333013, "flos": 30081542901120.0, "grad_norm": 2.0517758680319895, "language_loss": 0.77630448, "learning_rate": 4.4148461576312646e-07, "loss": 0.79815996, "num_input_tokens_seen": 141734810, "step": 6577, "time_per_iteration": 2.5662777423858643 }, { "auxiliary_loss_clip": 0.01151506, "auxiliary_loss_mlp": 0.01024916, "balance_loss_clip": 1.0480001, "balance_loss_mlp": 1.01815128, "epoch": 0.7909577346239404, "flos": 20996359084800.0, "grad_norm": 1.522207665126103, "language_loss": 0.74633276, "learning_rate": 4.4099654962335343e-07, "loss": 0.76809698, "num_input_tokens_seen": 141755260, "step": 6578, "time_per_iteration": 2.52166748046875 }, { "auxiliary_loss_clip": 0.01145975, "auxiliary_loss_mlp": 0.01028701, "balance_loss_clip": 1.0477432, "balance_loss_mlp": 1.02118754, "epoch": 0.7910779775145794, "flos": 26247935128320.0, "grad_norm": 1.7162982234902366, "language_loss": 0.75044149, "learning_rate": 4.405087199831636e-07, "loss": 0.77218825, "num_input_tokens_seen": 141775500, "step": 6579, "time_per_iteration": 2.57381534576416 }, { "auxiliary_loss_clip": 0.011375, "auxiliary_loss_mlp": 0.00760505, "balance_loss_clip": 1.04330015, "balance_loss_mlp": 1.00023937, "epoch": 0.7911982204052186, "flos": 22564434291840.0, "grad_norm": 1.9595734191099345, "language_loss": 0.6692102, "learning_rate": 4.400211269165619e-07, "loss": 0.68819028, "num_input_tokens_seen": 141791955, "step": 6580, "time_per_iteration": 2.5479655265808105 }, { "auxiliary_loss_clip": 0.01167803, "auxiliary_loss_mlp": 0.0102889, "balance_loss_clip": 1.05225348, "balance_loss_mlp": 1.02178812, "epoch": 0.7913184632958576, "flos": 23112538899840.0, "grad_norm": 1.743187620008219, "language_loss": 0.76746804, "learning_rate": 4.3953377049751416e-07, "loss": 0.78943497, "num_input_tokens_seen": 141812380, "step": 6581, "time_per_iteration": 2.4746673107147217 }, { "auxiliary_loss_clip": 0.01141219, "auxiliary_loss_mlp": 0.01025047, "balance_loss_clip": 1.04764462, "balance_loss_mlp": 1.0185709, "epoch": 0.7914387061864967, "flos": 12311758719360.0, "grad_norm": 2.147341565890573, "language_loss": 0.77872324, "learning_rate": 4.390466507999537e-07, "loss": 0.80038595, "num_input_tokens_seen": 141828130, "step": 6582, "time_per_iteration": 2.495028018951416 }, { "auxiliary_loss_clip": 0.01117184, "auxiliary_loss_mlp": 0.01026257, "balance_loss_clip": 1.04293323, "balance_loss_mlp": 1.01939678, "epoch": 0.7915589490771359, "flos": 17603267708160.0, "grad_norm": 1.9929535220813694, "language_loss": 0.75689065, "learning_rate": 4.385597678977748e-07, "loss": 0.77832508, "num_input_tokens_seen": 141846965, "step": 6583, "time_per_iteration": 2.5318000316619873 }, { "auxiliary_loss_clip": 0.01131249, "auxiliary_loss_mlp": 0.01028411, "balance_loss_clip": 1.04184687, "balance_loss_mlp": 1.02078772, "epoch": 0.7916791919677749, "flos": 25591272641280.0, "grad_norm": 1.5531971008921983, "language_loss": 0.7540316, "learning_rate": 4.3807312186483726e-07, "loss": 0.77562821, "num_input_tokens_seen": 141867685, "step": 6584, "time_per_iteration": 3.340477705001831 }, { "auxiliary_loss_clip": 0.01149258, "auxiliary_loss_mlp": 0.01026979, "balance_loss_clip": 1.0487802, "balance_loss_mlp": 1.01952195, "epoch": 0.791799434858414, "flos": 18844340474880.0, "grad_norm": 1.9644286835395501, "language_loss": 0.78514743, "learning_rate": 4.375867127749655e-07, "loss": 0.8069098, "num_input_tokens_seen": 141885960, "step": 6585, "time_per_iteration": 2.4824368953704834 }, { "auxiliary_loss_clip": 0.01124678, "auxiliary_loss_mlp": 0.01025529, "balance_loss_clip": 1.04537022, "balance_loss_mlp": 1.01841843, "epoch": 0.7919196777490531, "flos": 25812015672960.0, "grad_norm": 2.371266189956271, "language_loss": 0.67202908, "learning_rate": 4.3710054070194744e-07, "loss": 0.69353116, "num_input_tokens_seen": 141905655, "step": 6586, "time_per_iteration": 3.39778733253479 }, { "auxiliary_loss_clip": 0.01165373, "auxiliary_loss_mlp": 0.00761136, "balance_loss_clip": 1.04705024, "balance_loss_mlp": 1.00025678, "epoch": 0.7920399206396922, "flos": 11947624594560.0, "grad_norm": 2.7176262955585675, "language_loss": 0.66416442, "learning_rate": 4.3661460571953455e-07, "loss": 0.68342948, "num_input_tokens_seen": 141922390, "step": 6587, "time_per_iteration": 3.166304588317871 }, { "auxiliary_loss_clip": 0.01150951, "auxiliary_loss_mlp": 0.01021042, "balance_loss_clip": 1.0450902, "balance_loss_mlp": 1.01450372, "epoch": 0.7921601635303313, "flos": 21579907438080.0, "grad_norm": 1.5334389942060427, "language_loss": 0.68643111, "learning_rate": 4.36128907901443e-07, "loss": 0.70815104, "num_input_tokens_seen": 141941985, "step": 6588, "time_per_iteration": 2.5147759914398193 }, { "auxiliary_loss_clip": 0.01122664, "auxiliary_loss_mlp": 0.0102457, "balance_loss_clip": 1.0421598, "balance_loss_mlp": 1.01728654, "epoch": 0.7922804064209703, "flos": 18113989236480.0, "grad_norm": 1.9616576293152, "language_loss": 0.72468239, "learning_rate": 4.356434473213519e-07, "loss": 0.74615467, "num_input_tokens_seen": 141959435, "step": 6589, "time_per_iteration": 2.5261049270629883 }, { "auxiliary_loss_clip": 0.01136681, "auxiliary_loss_mlp": 0.01027195, "balance_loss_clip": 1.04841745, "balance_loss_mlp": 1.02073669, "epoch": 0.7924006493116095, "flos": 21652806090240.0, "grad_norm": 1.7140157467386665, "language_loss": 0.79905188, "learning_rate": 4.351582240529068e-07, "loss": 0.82069057, "num_input_tokens_seen": 141980265, "step": 6590, "time_per_iteration": 2.5357437133789062 }, { "auxiliary_loss_clip": 0.01038962, "auxiliary_loss_mlp": 0.01001461, "balance_loss_clip": 1.00795221, "balance_loss_mlp": 1.00031018, "epoch": 0.7925208922022485, "flos": 64242755694720.0, "grad_norm": 0.7038560101057136, "language_loss": 0.58225006, "learning_rate": 4.346732381697149e-07, "loss": 0.60265428, "num_input_tokens_seen": 142044395, "step": 6591, "time_per_iteration": 3.1610167026519775 }, { "auxiliary_loss_clip": 0.01134014, "auxiliary_loss_mlp": 0.01029904, "balance_loss_clip": 1.04685128, "balance_loss_mlp": 1.02311814, "epoch": 0.7926411350928876, "flos": 16941541403520.0, "grad_norm": 2.0025836263401926, "language_loss": 0.81038892, "learning_rate": 4.3418848974534825e-07, "loss": 0.83202803, "num_input_tokens_seen": 142061335, "step": 6592, "time_per_iteration": 2.5060744285583496 }, { "auxiliary_loss_clip": 0.01127394, "auxiliary_loss_mlp": 0.01028063, "balance_loss_clip": 1.04485846, "balance_loss_mlp": 1.02104473, "epoch": 0.7927613779835267, "flos": 34459987144320.0, "grad_norm": 1.583174532761633, "language_loss": 0.68685764, "learning_rate": 4.3370397885334276e-07, "loss": 0.70841217, "num_input_tokens_seen": 142081965, "step": 6593, "time_per_iteration": 2.666896343231201 }, { "auxiliary_loss_clip": 0.01146083, "auxiliary_loss_mlp": 0.01030254, "balance_loss_clip": 1.04801726, "balance_loss_mlp": 1.02318764, "epoch": 0.7928816208741658, "flos": 18951174501120.0, "grad_norm": 1.7026733696224499, "language_loss": 0.75381243, "learning_rate": 4.3321970556719777e-07, "loss": 0.77557588, "num_input_tokens_seen": 142100260, "step": 6594, "time_per_iteration": 2.486280918121338 }, { "auxiliary_loss_clip": 0.01162719, "auxiliary_loss_mlp": 0.01029566, "balance_loss_clip": 1.04746091, "balance_loss_mlp": 1.02179384, "epoch": 0.7930018637648049, "flos": 18623022825600.0, "grad_norm": 2.148946758554579, "language_loss": 0.71746063, "learning_rate": 4.3273566996037856e-07, "loss": 0.73938346, "num_input_tokens_seen": 142116955, "step": 6595, "time_per_iteration": 2.4435129165649414 }, { "auxiliary_loss_clip": 0.01138315, "auxiliary_loss_mlp": 0.01025137, "balance_loss_clip": 1.04625237, "balance_loss_mlp": 1.01851773, "epoch": 0.793122106655444, "flos": 24530650824960.0, "grad_norm": 1.9765610188175706, "language_loss": 0.80409986, "learning_rate": 4.322518721063113e-07, "loss": 0.82573438, "num_input_tokens_seen": 142135505, "step": 6596, "time_per_iteration": 2.5627260208129883 }, { "auxiliary_loss_clip": 0.01152308, "auxiliary_loss_mlp": 0.01027149, "balance_loss_clip": 1.04783463, "balance_loss_mlp": 1.01985383, "epoch": 0.7932423495460831, "flos": 34421203434240.0, "grad_norm": 2.0239949022226553, "language_loss": 0.70101738, "learning_rate": 4.3176831207838906e-07, "loss": 0.722812, "num_input_tokens_seen": 142158915, "step": 6597, "time_per_iteration": 2.6100752353668213 }, { "auxiliary_loss_clip": 0.01149724, "auxiliary_loss_mlp": 0.01024577, "balance_loss_clip": 1.04844832, "balance_loss_mlp": 1.01825011, "epoch": 0.7933625924367221, "flos": 26980333441920.0, "grad_norm": 1.7893116262424522, "language_loss": 0.74648607, "learning_rate": 4.3128498994996685e-07, "loss": 0.76822907, "num_input_tokens_seen": 142178390, "step": 6598, "time_per_iteration": 2.5418381690979004 }, { "auxiliary_loss_clip": 0.01155891, "auxiliary_loss_mlp": 0.01029783, "balance_loss_clip": 1.04768372, "balance_loss_mlp": 1.02251148, "epoch": 0.7934828353273613, "flos": 29568630643200.0, "grad_norm": 3.398297190593282, "language_loss": 0.71302295, "learning_rate": 4.308019057943646e-07, "loss": 0.73487973, "num_input_tokens_seen": 142200115, "step": 6599, "time_per_iteration": 2.5587828159332275 }, { "auxiliary_loss_clip": 0.01114495, "auxiliary_loss_mlp": 0.01022518, "balance_loss_clip": 1.04413605, "balance_loss_mlp": 1.01605415, "epoch": 0.7936030782180004, "flos": 28615381557120.0, "grad_norm": 1.5925909807756664, "language_loss": 0.74619514, "learning_rate": 4.3031905968486535e-07, "loss": 0.76756531, "num_input_tokens_seen": 142220945, "step": 6600, "time_per_iteration": 3.368894338607788 }, { "auxiliary_loss_clip": 0.01109897, "auxiliary_loss_mlp": 0.01025383, "balance_loss_clip": 1.04560375, "balance_loss_mlp": 1.01850152, "epoch": 0.7937233211086394, "flos": 16392574869120.0, "grad_norm": 1.9948816406834091, "language_loss": 0.68417454, "learning_rate": 4.298364516947162e-07, "loss": 0.70552725, "num_input_tokens_seen": 142238175, "step": 6601, "time_per_iteration": 2.5723087787628174 }, { "auxiliary_loss_clip": 0.01105033, "auxiliary_loss_mlp": 0.01022177, "balance_loss_clip": 1.0414629, "balance_loss_mlp": 1.01566505, "epoch": 0.7938435639992786, "flos": 22013420682240.0, "grad_norm": 1.7909913735749496, "language_loss": 0.6523906, "learning_rate": 4.293540818971295e-07, "loss": 0.67366266, "num_input_tokens_seen": 142255980, "step": 6602, "time_per_iteration": 2.587277889251709 }, { "auxiliary_loss_clip": 0.01153989, "auxiliary_loss_mlp": 0.0102398, "balance_loss_clip": 1.04600072, "balance_loss_mlp": 1.0171349, "epoch": 0.7939638068899176, "flos": 22197032029440.0, "grad_norm": 1.9606597166038036, "language_loss": 0.76525217, "learning_rate": 4.2887195036527934e-07, "loss": 0.78703183, "num_input_tokens_seen": 142274785, "step": 6603, "time_per_iteration": 2.5142898559570312 }, { "auxiliary_loss_clip": 0.01142244, "auxiliary_loss_mlp": 0.01024075, "balance_loss_clip": 1.04367864, "balance_loss_mlp": 1.01688051, "epoch": 0.7940840497805567, "flos": 17745186343680.0, "grad_norm": 2.6100554389737978, "language_loss": 0.73181641, "learning_rate": 4.28390057172306e-07, "loss": 0.75347966, "num_input_tokens_seen": 142291290, "step": 6604, "time_per_iteration": 2.462362766265869 }, { "auxiliary_loss_clip": 0.01115957, "auxiliary_loss_mlp": 0.0102642, "balance_loss_clip": 1.0402174, "balance_loss_mlp": 1.01905608, "epoch": 0.7942042926711959, "flos": 23805435231360.0, "grad_norm": 2.2607773672117957, "language_loss": 0.72081506, "learning_rate": 4.279084023913111e-07, "loss": 0.74223888, "num_input_tokens_seen": 142309165, "step": 6605, "time_per_iteration": 2.5864293575286865 }, { "auxiliary_loss_clip": 0.01149802, "auxiliary_loss_mlp": 0.0102848, "balance_loss_clip": 1.04692185, "balance_loss_mlp": 1.0218637, "epoch": 0.7943245355618349, "flos": 19244959839360.0, "grad_norm": 1.6500159770580556, "language_loss": 0.69047308, "learning_rate": 4.2742698609536096e-07, "loss": 0.7122559, "num_input_tokens_seen": 142327475, "step": 6606, "time_per_iteration": 2.485440969467163 }, { "auxiliary_loss_clip": 0.0114003, "auxiliary_loss_mlp": 0.01025886, "balance_loss_clip": 1.04618764, "balance_loss_mlp": 1.01915419, "epoch": 0.794444778452474, "flos": 25007616547200.0, "grad_norm": 1.6323515738981953, "language_loss": 0.78206521, "learning_rate": 4.2694580835748706e-07, "loss": 0.80372441, "num_input_tokens_seen": 142347335, "step": 6607, "time_per_iteration": 2.569101572036743 }, { "auxiliary_loss_clip": 0.01134812, "auxiliary_loss_mlp": 0.01026422, "balance_loss_clip": 1.04673719, "balance_loss_mlp": 1.01935911, "epoch": 0.7945650213431131, "flos": 23221491828480.0, "grad_norm": 1.9419003508898611, "language_loss": 0.74163848, "learning_rate": 4.264648692506836e-07, "loss": 0.76325083, "num_input_tokens_seen": 142366125, "step": 6608, "time_per_iteration": 2.527162551879883 }, { "auxiliary_loss_clip": 0.01130562, "auxiliary_loss_mlp": 0.01040733, "balance_loss_clip": 1.04375148, "balance_loss_mlp": 1.03288007, "epoch": 0.7946852642337522, "flos": 26062887237120.0, "grad_norm": 2.104910375317638, "language_loss": 0.72087961, "learning_rate": 4.2598416884790824e-07, "loss": 0.74259257, "num_input_tokens_seen": 142385175, "step": 6609, "time_per_iteration": 2.5647928714752197 }, { "auxiliary_loss_clip": 0.01144377, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.0443393, "balance_loss_mlp": 1.02356887, "epoch": 0.7948055071243912, "flos": 23769704177280.0, "grad_norm": 1.910184028955897, "language_loss": 0.80661285, "learning_rate": 4.255037072220828e-07, "loss": 0.82836163, "num_input_tokens_seen": 142406545, "step": 6610, "time_per_iteration": 3.3658664226531982 }, { "auxiliary_loss_clip": 0.01161056, "auxiliary_loss_mlp": 0.01022036, "balance_loss_clip": 1.04748893, "balance_loss_mlp": 1.01522303, "epoch": 0.7949257500150304, "flos": 21980814111360.0, "grad_norm": 1.5703541711064393, "language_loss": 0.71702212, "learning_rate": 4.2502348444609293e-07, "loss": 0.73885298, "num_input_tokens_seen": 142426165, "step": 6611, "time_per_iteration": 3.303710699081421 }, { "auxiliary_loss_clip": 0.01104208, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.03860843, "balance_loss_mlp": 1.01914275, "epoch": 0.7950459929056695, "flos": 25774129802880.0, "grad_norm": 1.770151523844581, "language_loss": 0.69898438, "learning_rate": 4.2454350059278844e-07, "loss": 0.72028828, "num_input_tokens_seen": 142447225, "step": 6612, "time_per_iteration": 2.643118381500244 }, { "auxiliary_loss_clip": 0.01129147, "auxiliary_loss_mlp": 0.01024179, "balance_loss_clip": 1.03966749, "balance_loss_mlp": 1.01757777, "epoch": 0.7951662357963085, "flos": 22158068751360.0, "grad_norm": 1.6778179939915567, "language_loss": 0.8396762, "learning_rate": 4.240637557349824e-07, "loss": 0.86120945, "num_input_tokens_seen": 142464440, "step": 6613, "time_per_iteration": 3.274561643600464 }, { "auxiliary_loss_clip": 0.01122231, "auxiliary_loss_mlp": 0.01026468, "balance_loss_clip": 1.04258704, "balance_loss_mlp": 1.01925027, "epoch": 0.7952864786869477, "flos": 24641938137600.0, "grad_norm": 1.8290358827188562, "language_loss": 0.66872811, "learning_rate": 4.235842499454516e-07, "loss": 0.69021499, "num_input_tokens_seen": 142484355, "step": 6614, "time_per_iteration": 2.582728147506714 }, { "auxiliary_loss_clip": 0.01136936, "auxiliary_loss_mlp": 0.01033628, "balance_loss_clip": 1.04491997, "balance_loss_mlp": 1.02680922, "epoch": 0.7954067215775867, "flos": 21830922656640.0, "grad_norm": 1.6836686437911408, "language_loss": 0.82876384, "learning_rate": 4.2310498329693687e-07, "loss": 0.85046947, "num_input_tokens_seen": 142505255, "step": 6615, "time_per_iteration": 2.5627050399780273 }, { "auxiliary_loss_clip": 0.01153452, "auxiliary_loss_mlp": 0.01027729, "balance_loss_clip": 1.04745746, "balance_loss_mlp": 1.01995039, "epoch": 0.7955269644682258, "flos": 24060652341120.0, "grad_norm": 1.5446957459098858, "language_loss": 0.80763626, "learning_rate": 4.2262595586214164e-07, "loss": 0.8294481, "num_input_tokens_seen": 142526350, "step": 6616, "time_per_iteration": 2.5603179931640625 }, { "auxiliary_loss_clip": 0.01153408, "auxiliary_loss_mlp": 0.0102833, "balance_loss_clip": 1.04655671, "balance_loss_mlp": 1.02113533, "epoch": 0.795647207358865, "flos": 25010741030400.0, "grad_norm": 1.7578453221163097, "language_loss": 0.76657647, "learning_rate": 4.221471677137358e-07, "loss": 0.78839391, "num_input_tokens_seen": 142547165, "step": 6617, "time_per_iteration": 2.5413687229156494 }, { "auxiliary_loss_clip": 0.01126073, "auxiliary_loss_mlp": 0.01021984, "balance_loss_clip": 1.0436511, "balance_loss_mlp": 1.01548672, "epoch": 0.795767450249504, "flos": 14648358343680.0, "grad_norm": 1.5626755915499213, "language_loss": 0.70025235, "learning_rate": 4.216686189243492e-07, "loss": 0.72173297, "num_input_tokens_seen": 142565955, "step": 6618, "time_per_iteration": 2.5288848876953125 }, { "auxiliary_loss_clip": 0.01118781, "auxiliary_loss_mlp": 0.01031614, "balance_loss_clip": 1.04407048, "balance_loss_mlp": 1.02368641, "epoch": 0.7958876931401431, "flos": 18547897530240.0, "grad_norm": 1.5816847304286525, "language_loss": 0.72719777, "learning_rate": 4.211903095665785e-07, "loss": 0.74870169, "num_input_tokens_seen": 142585340, "step": 6619, "time_per_iteration": 2.5549676418304443 }, { "auxiliary_loss_clip": 0.01144839, "auxiliary_loss_mlp": 0.01025014, "balance_loss_clip": 1.04430747, "balance_loss_mlp": 1.01821017, "epoch": 0.7960079360307821, "flos": 21543960902400.0, "grad_norm": 1.908723232002634, "language_loss": 0.75564939, "learning_rate": 4.2071223971298277e-07, "loss": 0.77734786, "num_input_tokens_seen": 142602525, "step": 6620, "time_per_iteration": 2.5222716331481934 }, { "auxiliary_loss_clip": 0.01151193, "auxiliary_loss_mlp": 0.01030363, "balance_loss_clip": 1.04591358, "balance_loss_mlp": 1.02294517, "epoch": 0.7961281789214213, "flos": 25481745095040.0, "grad_norm": 2.4195932859947993, "language_loss": 0.60969168, "learning_rate": 4.2023440943608433e-07, "loss": 0.63150728, "num_input_tokens_seen": 142622490, "step": 6621, "time_per_iteration": 2.52785325050354 }, { "auxiliary_loss_clip": 0.01147638, "auxiliary_loss_mlp": 0.01027321, "balance_loss_clip": 1.04353511, "balance_loss_mlp": 1.02084231, "epoch": 0.7962484218120603, "flos": 21944436612480.0, "grad_norm": 1.501799860551746, "language_loss": 0.78099811, "learning_rate": 4.1975681880837023e-07, "loss": 0.80274773, "num_input_tokens_seen": 142642495, "step": 6622, "time_per_iteration": 2.488443374633789 }, { "auxiliary_loss_clip": 0.01115712, "auxiliary_loss_mlp": 0.01025577, "balance_loss_clip": 1.03789806, "balance_loss_mlp": 1.01858246, "epoch": 0.7963686647026994, "flos": 18876264687360.0, "grad_norm": 1.8143565417476109, "language_loss": 0.82225442, "learning_rate": 4.192794679022895e-07, "loss": 0.84366727, "num_input_tokens_seen": 142660820, "step": 6623, "time_per_iteration": 2.5728352069854736 }, { "auxiliary_loss_clip": 0.01149274, "auxiliary_loss_mlp": 0.01028157, "balance_loss_clip": 1.04451632, "balance_loss_mlp": 1.02171707, "epoch": 0.7964889075933386, "flos": 29716582763520.0, "grad_norm": 1.91859315275791, "language_loss": 0.72212398, "learning_rate": 4.1880235679025743e-07, "loss": 0.74389827, "num_input_tokens_seen": 142680915, "step": 6624, "time_per_iteration": 2.575850248336792 }, { "auxiliary_loss_clip": 0.01095833, "auxiliary_loss_mlp": 0.01023294, "balance_loss_clip": 1.04130685, "balance_loss_mlp": 1.01589096, "epoch": 0.7966091504839776, "flos": 29491458272640.0, "grad_norm": 1.75079322329521, "language_loss": 0.63889122, "learning_rate": 4.1832548554464986e-07, "loss": 0.66008246, "num_input_tokens_seen": 142699210, "step": 6625, "time_per_iteration": 2.658163547515869 }, { "auxiliary_loss_clip": 0.01042895, "auxiliary_loss_mlp": 0.01001104, "balance_loss_clip": 1.00769544, "balance_loss_mlp": 1.000103, "epoch": 0.7967293933746167, "flos": 67288697101440.0, "grad_norm": 0.7427544454454184, "language_loss": 0.58796155, "learning_rate": 4.178488542378098e-07, "loss": 0.60840154, "num_input_tokens_seen": 142756790, "step": 6626, "time_per_iteration": 3.8376355171203613 }, { "auxiliary_loss_clip": 0.01167082, "auxiliary_loss_mlp": 0.01025702, "balance_loss_clip": 1.04791689, "balance_loss_mlp": 1.01863563, "epoch": 0.7968496362652558, "flos": 25554679660800.0, "grad_norm": 1.7135897055860592, "language_loss": 0.88920403, "learning_rate": 4.173724629420401e-07, "loss": 0.91113192, "num_input_tokens_seen": 142778150, "step": 6627, "time_per_iteration": 2.549018383026123 }, { "auxiliary_loss_clip": 0.01140039, "auxiliary_loss_mlp": 0.01025869, "balance_loss_clip": 1.04665935, "balance_loss_mlp": 1.01903534, "epoch": 0.7969698791558949, "flos": 14501088581760.0, "grad_norm": 3.2001831721414007, "language_loss": 0.68231958, "learning_rate": 4.168963117296087e-07, "loss": 0.70397866, "num_input_tokens_seen": 142795485, "step": 6628, "time_per_iteration": 2.5316152572631836 }, { "auxiliary_loss_clip": 0.0116657, "auxiliary_loss_mlp": 0.01026233, "balance_loss_clip": 1.04945374, "balance_loss_mlp": 1.01925623, "epoch": 0.797090122046534, "flos": 22127545169280.0, "grad_norm": 2.913187759763036, "language_loss": 0.7602942, "learning_rate": 4.1642040067274876e-07, "loss": 0.78222221, "num_input_tokens_seen": 142815155, "step": 6629, "time_per_iteration": 2.4640471935272217 }, { "auxiliary_loss_clip": 0.01139551, "auxiliary_loss_mlp": 0.01022963, "balance_loss_clip": 1.04398465, "balance_loss_mlp": 1.01629925, "epoch": 0.7972103649371731, "flos": 19897671830400.0, "grad_norm": 1.6379379016483138, "language_loss": 0.72587901, "learning_rate": 4.1594472984365493e-07, "loss": 0.74750417, "num_input_tokens_seen": 142833840, "step": 6630, "time_per_iteration": 2.534360647201538 }, { "auxiliary_loss_clip": 0.01145152, "auxiliary_loss_mlp": 0.01025236, "balance_loss_clip": 1.04581189, "balance_loss_mlp": 1.01842606, "epoch": 0.7973306078278122, "flos": 36058621847040.0, "grad_norm": 1.8830406408743918, "language_loss": 0.77529478, "learning_rate": 4.154692993144862e-07, "loss": 0.79699862, "num_input_tokens_seen": 142853610, "step": 6631, "time_per_iteration": 2.6311869621276855 }, { "auxiliary_loss_clip": 0.01164311, "auxiliary_loss_mlp": 0.00760702, "balance_loss_clip": 1.04761207, "balance_loss_mlp": 1.00027835, "epoch": 0.7974508507184512, "flos": 21360600950400.0, "grad_norm": 2.017246618167033, "language_loss": 0.71483135, "learning_rate": 4.1499410915736476e-07, "loss": 0.73408151, "num_input_tokens_seen": 142872540, "step": 6632, "time_per_iteration": 2.466123104095459 }, { "auxiliary_loss_clip": 0.01048968, "auxiliary_loss_mlp": 0.01004043, "balance_loss_clip": 1.01034272, "balance_loss_mlp": 1.0029757, "epoch": 0.7975710936090904, "flos": 68253115317120.0, "grad_norm": 0.7618349192891694, "language_loss": 0.64265859, "learning_rate": 4.145191594443762e-07, "loss": 0.6631887, "num_input_tokens_seen": 142936895, "step": 6633, "time_per_iteration": 3.2255425453186035 }, { "auxiliary_loss_clip": 0.01116541, "auxiliary_loss_mlp": 0.01033868, "balance_loss_clip": 1.04356503, "balance_loss_mlp": 1.0264442, "epoch": 0.7976913364997295, "flos": 22492433479680.0, "grad_norm": 1.5259003757178222, "language_loss": 0.70639694, "learning_rate": 4.140444502475713e-07, "loss": 0.72790104, "num_input_tokens_seen": 142956445, "step": 6634, "time_per_iteration": 2.545480966567993 }, { "auxiliary_loss_clip": 0.01148883, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.04664421, "balance_loss_mlp": 1.02458858, "epoch": 0.7978115793903685, "flos": 15263220378240.0, "grad_norm": 1.7931122894568114, "language_loss": 0.70022273, "learning_rate": 4.1356998163896216e-07, "loss": 0.72202766, "num_input_tokens_seen": 142973495, "step": 6635, "time_per_iteration": 2.4722578525543213 }, { "auxiliary_loss_clip": 0.01128401, "auxiliary_loss_mlp": 0.01024946, "balance_loss_clip": 1.04485011, "balance_loss_mlp": 1.01804411, "epoch": 0.7979318222810077, "flos": 19719232041600.0, "grad_norm": 1.9760259237073492, "language_loss": 0.74668521, "learning_rate": 4.130957536905255e-07, "loss": 0.76821864, "num_input_tokens_seen": 142991510, "step": 6636, "time_per_iteration": 2.556365489959717 }, { "auxiliary_loss_clip": 0.0114273, "auxiliary_loss_mlp": 0.01031861, "balance_loss_clip": 1.04714036, "balance_loss_mlp": 1.02442253, "epoch": 0.7980520651716467, "flos": 15560273854080.0, "grad_norm": 2.4764999787077873, "language_loss": 0.70832217, "learning_rate": 4.1262176647420134e-07, "loss": 0.73006809, "num_input_tokens_seen": 143009675, "step": 6637, "time_per_iteration": 3.325406551361084 }, { "auxiliary_loss_clip": 0.01141177, "auxiliary_loss_mlp": 0.01025411, "balance_loss_clip": 1.04513645, "balance_loss_mlp": 1.01854479, "epoch": 0.7981723080622858, "flos": 22309432663680.0, "grad_norm": 1.7280980014924456, "language_loss": 0.79478908, "learning_rate": 4.121480200618923e-07, "loss": 0.81645489, "num_input_tokens_seen": 143029330, "step": 6638, "time_per_iteration": 4.008262872695923 }, { "auxiliary_loss_clip": 0.01129792, "auxiliary_loss_mlp": 0.01025868, "balance_loss_clip": 1.04415441, "balance_loss_mlp": 1.01836395, "epoch": 0.798292550952925, "flos": 22929573997440.0, "grad_norm": 1.734862808406453, "language_loss": 0.80258644, "learning_rate": 4.116745145254674e-07, "loss": 0.82414305, "num_input_tokens_seen": 143048865, "step": 6639, "time_per_iteration": 2.5202112197875977 }, { "auxiliary_loss_clip": 0.0103253, "auxiliary_loss_mlp": 0.01001289, "balance_loss_clip": 1.00897694, "balance_loss_mlp": 1.00018072, "epoch": 0.798412793843564, "flos": 64497936890880.0, "grad_norm": 0.7685837897642603, "language_loss": 0.58041567, "learning_rate": 4.1120124993675476e-07, "loss": 0.60075384, "num_input_tokens_seen": 143113295, "step": 6640, "time_per_iteration": 3.1237452030181885 }, { "auxiliary_loss_clip": 0.01144942, "auxiliary_loss_mlp": 0.01028602, "balance_loss_clip": 1.04604197, "balance_loss_mlp": 1.02133667, "epoch": 0.7985330367342031, "flos": 13586910514560.0, "grad_norm": 1.867603598885377, "language_loss": 0.61795938, "learning_rate": 4.107282263675498e-07, "loss": 0.63969481, "num_input_tokens_seen": 143130965, "step": 6641, "time_per_iteration": 2.499401330947876 }, { "auxiliary_loss_clip": 0.01033041, "auxiliary_loss_mlp": 0.00751179, "balance_loss_clip": 1.01074207, "balance_loss_mlp": 1.00009394, "epoch": 0.7986532796248422, "flos": 67698797656320.0, "grad_norm": 0.7697077992936361, "language_loss": 0.52542174, "learning_rate": 4.1025544388960907e-07, "loss": 0.54326397, "num_input_tokens_seen": 143192005, "step": 6642, "time_per_iteration": 3.150620460510254 }, { "auxiliary_loss_clip": 0.01148833, "auxiliary_loss_mlp": 0.01024567, "balance_loss_clip": 1.0469141, "balance_loss_mlp": 1.01733959, "epoch": 0.7987735225154813, "flos": 22455373622400.0, "grad_norm": 1.7254431601377087, "language_loss": 0.71554267, "learning_rate": 4.097829025746538e-07, "loss": 0.73727673, "num_input_tokens_seen": 143213550, "step": 6643, "time_per_iteration": 2.5658533573150635 }, { "auxiliary_loss_clip": 0.01046766, "auxiliary_loss_mlp": 0.01000616, "balance_loss_clip": 1.00984478, "balance_loss_mlp": 0.99953759, "epoch": 0.7988937654061203, "flos": 68864098682880.0, "grad_norm": 0.6576310862352848, "language_loss": 0.61018163, "learning_rate": 4.0931060249436757e-07, "loss": 0.63065547, "num_input_tokens_seen": 143277390, "step": 6644, "time_per_iteration": 3.118819236755371 }, { "auxiliary_loss_clip": 0.01148992, "auxiliary_loss_mlp": 0.01030481, "balance_loss_clip": 1.04840231, "balance_loss_mlp": 1.02314687, "epoch": 0.7990140082967595, "flos": 20806893820800.0, "grad_norm": 1.9479141169163285, "language_loss": 0.69514942, "learning_rate": 4.088385437203978e-07, "loss": 0.71694422, "num_input_tokens_seen": 143294400, "step": 6645, "time_per_iteration": 2.4912755489349365 }, { "auxiliary_loss_clip": 0.01165528, "auxiliary_loss_mlp": 0.01029238, "balance_loss_clip": 1.04766035, "balance_loss_mlp": 1.02172232, "epoch": 0.7991342511873986, "flos": 18985289443200.0, "grad_norm": 1.9436977801899171, "language_loss": 0.77824122, "learning_rate": 4.083667263243564e-07, "loss": 0.8001889, "num_input_tokens_seen": 143312745, "step": 6646, "time_per_iteration": 2.444019317626953 }, { "auxiliary_loss_clip": 0.01146411, "auxiliary_loss_mlp": 0.01029233, "balance_loss_clip": 1.0472039, "balance_loss_mlp": 1.021824, "epoch": 0.7992544940780376, "flos": 20816805974400.0, "grad_norm": 1.83554357887033, "language_loss": 0.71876597, "learning_rate": 4.0789515037781653e-07, "loss": 0.74052233, "num_input_tokens_seen": 143333470, "step": 6647, "time_per_iteration": 2.5194504261016846 }, { "auxiliary_loss_clip": 0.01153593, "auxiliary_loss_mlp": 0.0103105, "balance_loss_clip": 1.04677653, "balance_loss_mlp": 1.023772, "epoch": 0.7993747369686768, "flos": 12640772321280.0, "grad_norm": 1.714062898542251, "language_loss": 0.82390612, "learning_rate": 4.0742381595231755e-07, "loss": 0.84575254, "num_input_tokens_seen": 143350195, "step": 6648, "time_per_iteration": 2.4593935012817383 }, { "auxiliary_loss_clip": 0.01124924, "auxiliary_loss_mlp": 0.01025515, "balance_loss_clip": 1.04492974, "balance_loss_mlp": 1.01866353, "epoch": 0.7994949798593158, "flos": 20078769225600.0, "grad_norm": 2.535993953280498, "language_loss": 0.7816413, "learning_rate": 4.06952723119359e-07, "loss": 0.80314571, "num_input_tokens_seen": 143370070, "step": 6649, "time_per_iteration": 2.5530855655670166 }, { "auxiliary_loss_clip": 0.01130984, "auxiliary_loss_mlp": 0.01033476, "balance_loss_clip": 1.04555178, "balance_loss_mlp": 1.026425, "epoch": 0.7996152227499549, "flos": 38654209509120.0, "grad_norm": 1.8148378036008224, "language_loss": 0.67171383, "learning_rate": 4.0648187195040504e-07, "loss": 0.69335842, "num_input_tokens_seen": 143392275, "step": 6650, "time_per_iteration": 2.6629462242126465 }, { "auxiliary_loss_clip": 0.01042793, "auxiliary_loss_mlp": 0.01001356, "balance_loss_clip": 1.00783908, "balance_loss_mlp": 1.00024164, "epoch": 0.799735465640594, "flos": 70243821947520.0, "grad_norm": 1.4580852878270008, "language_loss": 0.67622507, "learning_rate": 4.060112625168848e-07, "loss": 0.6966666, "num_input_tokens_seen": 143457385, "step": 6651, "time_per_iteration": 3.171459197998047 }, { "auxiliary_loss_clip": 0.0116375, "auxiliary_loss_mlp": 0.01027719, "balance_loss_clip": 1.04872084, "balance_loss_mlp": 1.02021146, "epoch": 0.7998557085312331, "flos": 24240995550720.0, "grad_norm": 1.8402820938763091, "language_loss": 0.74077052, "learning_rate": 4.055408948901886e-07, "loss": 0.76268518, "num_input_tokens_seen": 143478785, "step": 6652, "time_per_iteration": 3.2113137245178223 }, { "auxiliary_loss_clip": 0.01153984, "auxiliary_loss_mlp": 0.01031917, "balance_loss_clip": 1.04714298, "balance_loss_mlp": 1.02452052, "epoch": 0.7999759514218722, "flos": 27564025449600.0, "grad_norm": 1.6890959991114904, "language_loss": 0.7120319, "learning_rate": 4.050707691416708e-07, "loss": 0.73389089, "num_input_tokens_seen": 143500095, "step": 6653, "time_per_iteration": 2.537909984588623 }, { "auxiliary_loss_clip": 0.01043095, "auxiliary_loss_mlp": 0.01001118, "balance_loss_clip": 1.0077951, "balance_loss_mlp": 1.0000447, "epoch": 0.8000961943125112, "flos": 67337428878720.0, "grad_norm": 0.6815958035072053, "language_loss": 0.5981791, "learning_rate": 4.046008853426495e-07, "loss": 0.61862123, "num_input_tokens_seen": 143563410, "step": 6654, "time_per_iteration": 3.1463377475738525 }, { "auxiliary_loss_clip": 0.01118954, "auxiliary_loss_mlp": 0.010251, "balance_loss_clip": 1.04474628, "balance_loss_mlp": 1.01783729, "epoch": 0.8002164372031504, "flos": 28733815676160.0, "grad_norm": 1.679718745540682, "language_loss": 0.62453675, "learning_rate": 4.0413124356440464e-07, "loss": 0.64597726, "num_input_tokens_seen": 143587455, "step": 6655, "time_per_iteration": 2.6492037773132324 }, { "auxiliary_loss_clip": 0.01110931, "auxiliary_loss_mlp": 0.0102511, "balance_loss_clip": 1.04154587, "balance_loss_mlp": 1.01786852, "epoch": 0.8003366800937894, "flos": 17639429725440.0, "grad_norm": 2.849524047350204, "language_loss": 0.82267046, "learning_rate": 4.0366184387818223e-07, "loss": 0.84403086, "num_input_tokens_seen": 143605915, "step": 6656, "time_per_iteration": 2.602039098739624 }, { "auxiliary_loss_clip": 0.01168857, "auxiliary_loss_mlp": 0.01024928, "balance_loss_clip": 1.04915285, "balance_loss_mlp": 1.01763225, "epoch": 0.8004569229844285, "flos": 25995303797760.0, "grad_norm": 2.2324361412677165, "language_loss": 0.84952039, "learning_rate": 4.0319268635518797e-07, "loss": 0.87145823, "num_input_tokens_seen": 143626490, "step": 6657, "time_per_iteration": 2.516737461090088 }, { "auxiliary_loss_clip": 0.01152529, "auxiliary_loss_mlp": 0.01026825, "balance_loss_clip": 1.04691815, "balance_loss_mlp": 1.0199523, "epoch": 0.8005771658750677, "flos": 20812352688000.0, "grad_norm": 1.495801578830262, "language_loss": 0.74953699, "learning_rate": 4.027237710665943e-07, "loss": 0.7713306, "num_input_tokens_seen": 143644955, "step": 6658, "time_per_iteration": 2.5172903537750244 }, { "auxiliary_loss_clip": 0.01127109, "auxiliary_loss_mlp": 0.0102462, "balance_loss_clip": 1.04311562, "balance_loss_mlp": 1.01690102, "epoch": 0.8006974087657067, "flos": 25812626204160.0, "grad_norm": 1.685217632641309, "language_loss": 0.69631088, "learning_rate": 4.022550980835344e-07, "loss": 0.71782815, "num_input_tokens_seen": 143667200, "step": 6659, "time_per_iteration": 2.610893487930298 }, { "auxiliary_loss_clip": 0.01120327, "auxiliary_loss_mlp": 0.01030568, "balance_loss_clip": 1.04117632, "balance_loss_mlp": 1.02388024, "epoch": 0.8008176516563458, "flos": 17164690646400.0, "grad_norm": 2.283537488800438, "language_loss": 0.79491341, "learning_rate": 4.017866674771051e-07, "loss": 0.81642228, "num_input_tokens_seen": 143684685, "step": 6660, "time_per_iteration": 2.5495524406433105 }, { "auxiliary_loss_clip": 0.0109941, "auxiliary_loss_mlp": 0.01027934, "balance_loss_clip": 1.03772807, "balance_loss_mlp": 1.0207696, "epoch": 0.8009378945469849, "flos": 24207311571840.0, "grad_norm": 1.6538804311885569, "language_loss": 0.74134356, "learning_rate": 4.013184793183688e-07, "loss": 0.76261705, "num_input_tokens_seen": 143706780, "step": 6661, "time_per_iteration": 2.6161577701568604 }, { "auxiliary_loss_clip": 0.01151512, "auxiliary_loss_mlp": 0.01027766, "balance_loss_clip": 1.04620838, "balance_loss_mlp": 1.02122414, "epoch": 0.801058137437624, "flos": 19787318271360.0, "grad_norm": 1.7714687482939586, "language_loss": 0.72577608, "learning_rate": 4.008505336783472e-07, "loss": 0.74756885, "num_input_tokens_seen": 143724505, "step": 6662, "time_per_iteration": 3.2638626098632812 }, { "auxiliary_loss_clip": 0.01141274, "auxiliary_loss_mlp": 0.01027749, "balance_loss_clip": 1.04392076, "balance_loss_mlp": 1.02080822, "epoch": 0.801178380328263, "flos": 18659400324480.0, "grad_norm": 1.7530701023660586, "language_loss": 0.80653691, "learning_rate": 4.003828306280284e-07, "loss": 0.82822716, "num_input_tokens_seen": 143742180, "step": 6663, "time_per_iteration": 3.304621696472168 }, { "auxiliary_loss_clip": 0.01148187, "auxiliary_loss_mlp": 0.01023801, "balance_loss_clip": 1.04645801, "balance_loss_mlp": 1.01735187, "epoch": 0.8012986232189022, "flos": 15706573948800.0, "grad_norm": 1.7454552554022325, "language_loss": 0.78191954, "learning_rate": 3.999153702383626e-07, "loss": 0.80363941, "num_input_tokens_seen": 143760070, "step": 6664, "time_per_iteration": 3.2237563133239746 }, { "auxiliary_loss_clip": 0.01154592, "auxiliary_loss_mlp": 0.01027577, "balance_loss_clip": 1.04677439, "balance_loss_mlp": 1.02038002, "epoch": 0.8014188661095413, "flos": 28584139703040.0, "grad_norm": 1.6331992235062247, "language_loss": 0.73574305, "learning_rate": 3.9944815258026263e-07, "loss": 0.75756478, "num_input_tokens_seen": 143781890, "step": 6665, "time_per_iteration": 2.562884569168091 }, { "auxiliary_loss_clip": 0.0115534, "auxiliary_loss_mlp": 0.01034389, "balance_loss_clip": 1.04824936, "balance_loss_mlp": 1.02694988, "epoch": 0.8015391090001803, "flos": 29310360877440.0, "grad_norm": 1.6012660306524586, "language_loss": 0.82895231, "learning_rate": 3.989811777246057e-07, "loss": 0.85084963, "num_input_tokens_seen": 143802060, "step": 6666, "time_per_iteration": 2.545543670654297 }, { "auxiliary_loss_clip": 0.01056077, "auxiliary_loss_mlp": 0.01002363, "balance_loss_clip": 1.00847709, "balance_loss_mlp": 1.00133157, "epoch": 0.8016593518908195, "flos": 70397340675840.0, "grad_norm": 0.8487935945178949, "language_loss": 0.66268891, "learning_rate": 3.985144457422305e-07, "loss": 0.68327332, "num_input_tokens_seen": 143856345, "step": 6667, "time_per_iteration": 3.001004695892334 }, { "auxiliary_loss_clip": 0.01164849, "auxiliary_loss_mlp": 0.01029286, "balance_loss_clip": 1.0479877, "balance_loss_mlp": 1.0227859, "epoch": 0.8017795947814585, "flos": 26026114688640.0, "grad_norm": 1.7844329873290972, "language_loss": 0.76428503, "learning_rate": 3.9804795670394096e-07, "loss": 0.78622633, "num_input_tokens_seen": 143876470, "step": 6668, "time_per_iteration": 2.48689603805542 }, { "auxiliary_loss_clip": 0.01127435, "auxiliary_loss_mlp": 0.01031529, "balance_loss_clip": 1.04303479, "balance_loss_mlp": 1.02462065, "epoch": 0.8018998376720976, "flos": 22087181260800.0, "grad_norm": 1.641842118719, "language_loss": 0.70469052, "learning_rate": 3.975817106805022e-07, "loss": 0.72628009, "num_input_tokens_seen": 143895170, "step": 6669, "time_per_iteration": 2.5324020385742188 }, { "auxiliary_loss_clip": 0.01125939, "auxiliary_loss_mlp": 0.01027595, "balance_loss_clip": 1.04499674, "balance_loss_mlp": 1.02050543, "epoch": 0.8020200805627368, "flos": 34568545023360.0, "grad_norm": 1.9323712904365193, "language_loss": 0.65140826, "learning_rate": 3.97115707742645e-07, "loss": 0.67294359, "num_input_tokens_seen": 143915845, "step": 6670, "time_per_iteration": 2.674323558807373 }, { "auxiliary_loss_clip": 0.01138966, "auxiliary_loss_mlp": 0.01030608, "balance_loss_clip": 1.04710543, "balance_loss_mlp": 1.02381265, "epoch": 0.8021403234533758, "flos": 20120354196480.0, "grad_norm": 1.7624285274749412, "language_loss": 0.64961398, "learning_rate": 3.966499479610599e-07, "loss": 0.67130971, "num_input_tokens_seen": 143933940, "step": 6671, "time_per_iteration": 2.5324535369873047 }, { "auxiliary_loss_clip": 0.01119725, "auxiliary_loss_mlp": 0.0102988, "balance_loss_clip": 1.04435897, "balance_loss_mlp": 1.02337706, "epoch": 0.8022605663440149, "flos": 27746200252800.0, "grad_norm": 2.03527951757548, "language_loss": 0.65211892, "learning_rate": 3.9618443140640225e-07, "loss": 0.67361498, "num_input_tokens_seen": 143952850, "step": 6672, "time_per_iteration": 2.608290433883667 }, { "auxiliary_loss_clip": 0.01012436, "auxiliary_loss_mlp": 0.01000528, "balance_loss_clip": 1.00780821, "balance_loss_mlp": 0.99947882, "epoch": 0.802380809234654, "flos": 60244998768000.0, "grad_norm": 0.6880317878935087, "language_loss": 0.51449841, "learning_rate": 3.957191581492918e-07, "loss": 0.53462803, "num_input_tokens_seen": 144013610, "step": 6673, "time_per_iteration": 3.183957099914551 }, { "auxiliary_loss_clip": 0.01135307, "auxiliary_loss_mlp": 0.01029182, "balance_loss_clip": 1.0466615, "balance_loss_mlp": 1.02146304, "epoch": 0.8025010521252931, "flos": 15080722352640.0, "grad_norm": 2.9885558338290727, "language_loss": 0.71160913, "learning_rate": 3.952541282603097e-07, "loss": 0.73325408, "num_input_tokens_seen": 144028715, "step": 6674, "time_per_iteration": 2.4792304039001465 }, { "auxiliary_loss_clip": 0.01150101, "auxiliary_loss_mlp": 0.01021727, "balance_loss_clip": 1.04842544, "balance_loss_mlp": 1.01458883, "epoch": 0.8026212950159322, "flos": 22163527618560.0, "grad_norm": 1.9290309047659324, "language_loss": 0.83466089, "learning_rate": 3.9478934181000013e-07, "loss": 0.85637915, "num_input_tokens_seen": 144048740, "step": 6675, "time_per_iteration": 2.503544569015503 }, { "auxiliary_loss_clip": 0.01170032, "auxiliary_loss_mlp": 0.01030372, "balance_loss_clip": 1.050017, "balance_loss_mlp": 1.02296877, "epoch": 0.8027415379065713, "flos": 17675986792320.0, "grad_norm": 2.365832707706217, "language_loss": 0.84222734, "learning_rate": 3.943247988688714e-07, "loss": 0.86423135, "num_input_tokens_seen": 144067435, "step": 6676, "time_per_iteration": 2.4614522457122803 }, { "auxiliary_loss_clip": 0.01150056, "auxiliary_loss_mlp": 0.01024906, "balance_loss_clip": 1.04495823, "balance_loss_mlp": 1.01781607, "epoch": 0.8028617807972104, "flos": 21979593048960.0, "grad_norm": 1.6478960613640738, "language_loss": 0.7192058, "learning_rate": 3.938604995073933e-07, "loss": 0.74095541, "num_input_tokens_seen": 144085905, "step": 6677, "time_per_iteration": 2.518571376800537 }, { "auxiliary_loss_clip": 0.0114096, "auxiliary_loss_mlp": 0.0103284, "balance_loss_clip": 1.04631925, "balance_loss_mlp": 1.02557397, "epoch": 0.8029820236878494, "flos": 26428457905920.0, "grad_norm": 1.605553215703312, "language_loss": 0.65403855, "learning_rate": 3.9339644379600157e-07, "loss": 0.67577654, "num_input_tokens_seen": 144105735, "step": 6678, "time_per_iteration": 3.426527738571167 }, { "auxiliary_loss_clip": 0.01149863, "auxiliary_loss_mlp": 0.01030208, "balance_loss_clip": 1.04743719, "balance_loss_mlp": 1.02325845, "epoch": 0.8031022665784886, "flos": 17676489582720.0, "grad_norm": 2.643536410323045, "language_loss": 0.71594584, "learning_rate": 3.929326318050907e-07, "loss": 0.7377466, "num_input_tokens_seen": 144123405, "step": 6679, "time_per_iteration": 2.4547054767608643 }, { "auxiliary_loss_clip": 0.01161236, "auxiliary_loss_mlp": 0.01021845, "balance_loss_clip": 1.04605997, "balance_loss_mlp": 1.01514888, "epoch": 0.8032225094691277, "flos": 15450279431040.0, "grad_norm": 1.7960272059538764, "language_loss": 0.79191053, "learning_rate": 3.924690636050225e-07, "loss": 0.81374133, "num_input_tokens_seen": 144140815, "step": 6680, "time_per_iteration": 2.445206642150879 }, { "auxiliary_loss_clip": 0.011528, "auxiliary_loss_mlp": 0.01031975, "balance_loss_clip": 1.04813957, "balance_loss_mlp": 1.02496529, "epoch": 0.8033427523597667, "flos": 26179202453760.0, "grad_norm": 1.8894868394396995, "language_loss": 0.72918546, "learning_rate": 3.9200573926611915e-07, "loss": 0.75103325, "num_input_tokens_seen": 144162230, "step": 6681, "time_per_iteration": 2.5514700412750244 }, { "auxiliary_loss_clip": 0.01149317, "auxiliary_loss_mlp": 0.01025578, "balance_loss_clip": 1.04880595, "balance_loss_mlp": 1.01893485, "epoch": 0.8034629952504058, "flos": 21324905809920.0, "grad_norm": 2.0276199517586733, "language_loss": 0.72728634, "learning_rate": 3.9154265885866613e-07, "loss": 0.74903524, "num_input_tokens_seen": 144181540, "step": 6682, "time_per_iteration": 2.5415868759155273 }, { "auxiliary_loss_clip": 0.01151347, "auxiliary_loss_mlp": 0.01028849, "balance_loss_clip": 1.04850113, "balance_loss_mlp": 1.02099061, "epoch": 0.8035832381410449, "flos": 21651585027840.0, "grad_norm": 5.854196220570995, "language_loss": 0.74186611, "learning_rate": 3.9107982245291394e-07, "loss": 0.76366818, "num_input_tokens_seen": 144199665, "step": 6683, "time_per_iteration": 2.476905107498169 }, { "auxiliary_loss_clip": 0.01124317, "auxiliary_loss_mlp": 0.01023802, "balance_loss_clip": 1.04688287, "balance_loss_mlp": 1.01668859, "epoch": 0.803703481031684, "flos": 20518818744960.0, "grad_norm": 1.9915898959440057, "language_loss": 0.77391601, "learning_rate": 3.9061723011907245e-07, "loss": 0.79539716, "num_input_tokens_seen": 144219020, "step": 6684, "time_per_iteration": 2.559946060180664 }, { "auxiliary_loss_clip": 0.01136992, "auxiliary_loss_mlp": 0.0102626, "balance_loss_clip": 1.04439902, "balance_loss_mlp": 1.01872897, "epoch": 0.803823723922323, "flos": 22854807838080.0, "grad_norm": 2.0954607479584393, "language_loss": 0.79343569, "learning_rate": 3.901548819273179e-07, "loss": 0.81506819, "num_input_tokens_seen": 144239035, "step": 6685, "time_per_iteration": 2.5323619842529297 }, { "auxiliary_loss_clip": 0.01151585, "auxiliary_loss_mlp": 0.0102857, "balance_loss_clip": 1.04761231, "balance_loss_mlp": 1.02142668, "epoch": 0.8039439668129622, "flos": 21362145235200.0, "grad_norm": 1.9447530107700597, "language_loss": 0.69353664, "learning_rate": 3.896927779477881e-07, "loss": 0.71533823, "num_input_tokens_seen": 144258295, "step": 6686, "time_per_iteration": 2.4954659938812256 }, { "auxiliary_loss_clip": 0.01121965, "auxiliary_loss_mlp": 0.01023998, "balance_loss_clip": 1.04195094, "balance_loss_mlp": 1.0173198, "epoch": 0.8040642097036013, "flos": 23802382575360.0, "grad_norm": 2.1215832357854043, "language_loss": 0.67175412, "learning_rate": 3.892309182505833e-07, "loss": 0.69321376, "num_input_tokens_seen": 144276110, "step": 6687, "time_per_iteration": 2.5862650871276855 }, { "auxiliary_loss_clip": 0.01161662, "auxiliary_loss_mlp": 0.0102387, "balance_loss_clip": 1.04596066, "balance_loss_mlp": 1.01703632, "epoch": 0.8041844525942403, "flos": 25922046009600.0, "grad_norm": 1.9449071905057924, "language_loss": 0.86011076, "learning_rate": 3.887693029057675e-07, "loss": 0.88196611, "num_input_tokens_seen": 144295620, "step": 6688, "time_per_iteration": 2.496030569076538 }, { "auxiliary_loss_clip": 0.01136816, "auxiliary_loss_mlp": 0.01019386, "balance_loss_clip": 1.04496086, "balance_loss_mlp": 1.01280534, "epoch": 0.8043046954848795, "flos": 25191120153600.0, "grad_norm": 1.8961911110740348, "language_loss": 0.81279945, "learning_rate": 3.8830793198336684e-07, "loss": 0.83436143, "num_input_tokens_seen": 144315210, "step": 6689, "time_per_iteration": 4.181029319763184 }, { "auxiliary_loss_clip": 0.01152948, "auxiliary_loss_mlp": 0.0102833, "balance_loss_clip": 1.04657817, "balance_loss_mlp": 1.02080822, "epoch": 0.8044249383755185, "flos": 41719185123840.0, "grad_norm": 1.5776949225936465, "language_loss": 0.70259988, "learning_rate": 3.878468055533721e-07, "loss": 0.72441268, "num_input_tokens_seen": 144337750, "step": 6690, "time_per_iteration": 3.3972089290618896 }, { "auxiliary_loss_clip": 0.01132415, "auxiliary_loss_mlp": 0.01035214, "balance_loss_clip": 1.04741144, "balance_loss_mlp": 1.02752233, "epoch": 0.8045451812661576, "flos": 20631434860800.0, "grad_norm": 2.4804830211681663, "language_loss": 0.84846658, "learning_rate": 3.8738592368573464e-07, "loss": 0.87014282, "num_input_tokens_seen": 144355305, "step": 6691, "time_per_iteration": 2.5666115283966064 }, { "auxiliary_loss_clip": 0.01114125, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.04348254, "balance_loss_mlp": 1.0236522, "epoch": 0.8046654241567968, "flos": 29711806254720.0, "grad_norm": 1.841292785953817, "language_loss": 0.88128179, "learning_rate": 3.8692528645037137e-07, "loss": 0.90273273, "num_input_tokens_seen": 144374485, "step": 6692, "time_per_iteration": 2.6131651401519775 }, { "auxiliary_loss_clip": 0.01164159, "auxiliary_loss_mlp": 0.01031742, "balance_loss_clip": 1.04866779, "balance_loss_mlp": 1.02431548, "epoch": 0.8047856670474358, "flos": 17671389851520.0, "grad_norm": 2.001952394632532, "language_loss": 0.7796886, "learning_rate": 3.8646489391715907e-07, "loss": 0.8016476, "num_input_tokens_seen": 144388780, "step": 6693, "time_per_iteration": 2.4217946529388428 }, { "auxiliary_loss_clip": 0.01135809, "auxiliary_loss_mlp": 0.01027924, "balance_loss_clip": 1.04510224, "balance_loss_mlp": 1.02065182, "epoch": 0.8049059099380749, "flos": 17120699464320.0, "grad_norm": 4.5463865477578835, "language_loss": 0.87971389, "learning_rate": 3.8600474615593903e-07, "loss": 0.90135121, "num_input_tokens_seen": 144403395, "step": 6694, "time_per_iteration": 2.4824376106262207 }, { "auxiliary_loss_clip": 0.01029227, "auxiliary_loss_mlp": 0.0100294, "balance_loss_clip": 1.01031947, "balance_loss_mlp": 1.00174773, "epoch": 0.805026152828714, "flos": 62212903240320.0, "grad_norm": 0.779508188922491, "language_loss": 0.59740317, "learning_rate": 3.8554484323651605e-07, "loss": 0.61772478, "num_input_tokens_seen": 144465265, "step": 6695, "time_per_iteration": 3.220430374145508 }, { "auxiliary_loss_clip": 0.01148319, "auxiliary_loss_mlp": 0.00761062, "balance_loss_clip": 1.04554617, "balance_loss_mlp": 1.00029039, "epoch": 0.8051463957193531, "flos": 21688608971520.0, "grad_norm": 1.4743117208008705, "language_loss": 0.79375672, "learning_rate": 3.85085185228657e-07, "loss": 0.81285048, "num_input_tokens_seen": 144484235, "step": 6696, "time_per_iteration": 2.518263816833496 }, { "auxiliary_loss_clip": 0.01132258, "auxiliary_loss_mlp": 0.01030495, "balance_loss_clip": 1.04550076, "balance_loss_mlp": 1.02361643, "epoch": 0.8052666386099921, "flos": 32051458535040.0, "grad_norm": 1.8142725539054405, "language_loss": 0.73034286, "learning_rate": 3.8462577220209114e-07, "loss": 0.75197041, "num_input_tokens_seen": 144504610, "step": 6697, "time_per_iteration": 2.6155641078948975 }, { "auxiliary_loss_clip": 0.0105543, "auxiliary_loss_mlp": 0.0100095, "balance_loss_clip": 1.00794053, "balance_loss_mlp": 0.9998886, "epoch": 0.8053868815006313, "flos": 67157875768320.0, "grad_norm": 0.702858997951354, "language_loss": 0.5902074, "learning_rate": 3.8416660422651127e-07, "loss": 0.61077118, "num_input_tokens_seen": 144574260, "step": 6698, "time_per_iteration": 3.152697801589966 }, { "auxiliary_loss_clip": 0.01125314, "auxiliary_loss_mlp": 0.01022617, "balance_loss_clip": 1.04254293, "balance_loss_mlp": 1.01526213, "epoch": 0.8055071243912704, "flos": 23837000307840.0, "grad_norm": 1.7597572798502605, "language_loss": 0.67632967, "learning_rate": 3.837076813715723e-07, "loss": 0.69780898, "num_input_tokens_seen": 144594145, "step": 6699, "time_per_iteration": 2.5903286933898926 }, { "auxiliary_loss_clip": 0.01119698, "auxiliary_loss_mlp": 0.01027848, "balance_loss_clip": 1.04146528, "balance_loss_mlp": 1.01999247, "epoch": 0.8056273672819094, "flos": 21324510760320.0, "grad_norm": 1.742423916132846, "language_loss": 0.75181413, "learning_rate": 3.832490037068941e-07, "loss": 0.77328962, "num_input_tokens_seen": 144612935, "step": 6700, "time_per_iteration": 2.563020706176758 }, { "auxiliary_loss_clip": 0.01091064, "auxiliary_loss_mlp": 0.01026101, "balance_loss_clip": 1.03992414, "balance_loss_mlp": 1.01940489, "epoch": 0.8057476101725486, "flos": 25768383626880.0, "grad_norm": 1.8379550891336747, "language_loss": 0.75529301, "learning_rate": 3.827905713020554e-07, "loss": 0.7764647, "num_input_tokens_seen": 144630580, "step": 6701, "time_per_iteration": 2.645224094390869 }, { "auxiliary_loss_clip": 0.0112464, "auxiliary_loss_mlp": 0.01030745, "balance_loss_clip": 1.03975511, "balance_loss_mlp": 1.02241862, "epoch": 0.8058678530631876, "flos": 24535283679360.0, "grad_norm": 1.9727058312082406, "language_loss": 0.68778884, "learning_rate": 3.823323842266017e-07, "loss": 0.70934272, "num_input_tokens_seen": 144649975, "step": 6702, "time_per_iteration": 2.6198668479919434 }, { "auxiliary_loss_clip": 0.01151788, "auxiliary_loss_mlp": 0.01033394, "balance_loss_clip": 1.04427111, "balance_loss_mlp": 1.02631545, "epoch": 0.8059880959538267, "flos": 24753728240640.0, "grad_norm": 2.6559278005139912, "language_loss": 0.72923744, "learning_rate": 3.818744425500393e-07, "loss": 0.75108922, "num_input_tokens_seen": 144667990, "step": 6703, "time_per_iteration": 2.526411771774292 }, { "auxiliary_loss_clip": 0.01117412, "auxiliary_loss_mlp": 0.01029744, "balance_loss_clip": 1.04204512, "balance_loss_mlp": 1.0224216, "epoch": 0.8061083388444659, "flos": 22196349671040.0, "grad_norm": 1.6525712077960464, "language_loss": 0.80407345, "learning_rate": 3.8141674634183675e-07, "loss": 0.82554501, "num_input_tokens_seen": 144687020, "step": 6704, "time_per_iteration": 3.369621515274048 }, { "auxiliary_loss_clip": 0.01106729, "auxiliary_loss_mlp": 0.01028758, "balance_loss_clip": 1.04418302, "balance_loss_mlp": 1.02177227, "epoch": 0.8062285817351049, "flos": 30044195735040.0, "grad_norm": 2.0343958009564957, "language_loss": 0.66459018, "learning_rate": 3.809592956714278e-07, "loss": 0.68594503, "num_input_tokens_seen": 144710255, "step": 6705, "time_per_iteration": 2.645833969116211 }, { "auxiliary_loss_clip": 0.01157221, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.04933286, "balance_loss_mlp": 1.0267067, "epoch": 0.806348824625744, "flos": 22782591544320.0, "grad_norm": 1.8477915980738315, "language_loss": 0.74499363, "learning_rate": 3.805020906082057e-07, "loss": 0.76690781, "num_input_tokens_seen": 144728830, "step": 6706, "time_per_iteration": 2.496737003326416 }, { "auxiliary_loss_clip": 0.01138799, "auxiliary_loss_mlp": 0.0102896, "balance_loss_clip": 1.04427445, "balance_loss_mlp": 1.02125561, "epoch": 0.8064690675163831, "flos": 23404600385280.0, "grad_norm": 2.060984940040578, "language_loss": 0.80828321, "learning_rate": 3.8004513122152917e-07, "loss": 0.82996076, "num_input_tokens_seen": 144747140, "step": 6707, "time_per_iteration": 2.5241291522979736 }, { "auxiliary_loss_clip": 0.01124897, "auxiliary_loss_mlp": 0.01026539, "balance_loss_clip": 1.04355729, "balance_loss_mlp": 1.01997626, "epoch": 0.8065893104070222, "flos": 24060903736320.0, "grad_norm": 1.7450470920517303, "language_loss": 0.67172277, "learning_rate": 3.79588417580718e-07, "loss": 0.69323719, "num_input_tokens_seen": 144765250, "step": 6708, "time_per_iteration": 2.547727108001709 }, { "auxiliary_loss_clip": 0.01152235, "auxiliary_loss_mlp": 0.01029335, "balance_loss_clip": 1.0465771, "balance_loss_mlp": 1.02228403, "epoch": 0.8067095532976613, "flos": 22305410340480.0, "grad_norm": 1.83534674388313, "language_loss": 0.76346213, "learning_rate": 3.791319497550558e-07, "loss": 0.78527784, "num_input_tokens_seen": 144783080, "step": 6709, "time_per_iteration": 2.47678804397583 }, { "auxiliary_loss_clip": 0.01126549, "auxiliary_loss_mlp": 0.00760902, "balance_loss_clip": 1.04563761, "balance_loss_mlp": 1.00025964, "epoch": 0.8068297961883004, "flos": 17129498296320.0, "grad_norm": 1.903325463263411, "language_loss": 0.70772928, "learning_rate": 3.78675727813788e-07, "loss": 0.72660381, "num_input_tokens_seen": 144800645, "step": 6710, "time_per_iteration": 2.5286831855773926 }, { "auxiliary_loss_clip": 0.01137828, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.04566443, "balance_loss_mlp": 1.02161372, "epoch": 0.8069500390789395, "flos": 22018843635840.0, "grad_norm": 1.6049888872539804, "language_loss": 0.73669183, "learning_rate": 3.782197518261225e-07, "loss": 0.75835925, "num_input_tokens_seen": 144820085, "step": 6711, "time_per_iteration": 2.542816638946533 }, { "auxiliary_loss_clip": 0.01144603, "auxiliary_loss_mlp": 0.01024824, "balance_loss_clip": 1.04745007, "balance_loss_mlp": 1.01786804, "epoch": 0.8070702819695785, "flos": 19244241567360.0, "grad_norm": 1.937689601658259, "language_loss": 0.95622593, "learning_rate": 3.777640218612319e-07, "loss": 0.97792017, "num_input_tokens_seen": 144838070, "step": 6712, "time_per_iteration": 2.5099611282348633 }, { "auxiliary_loss_clip": 0.01145868, "auxiliary_loss_mlp": 0.01024339, "balance_loss_clip": 1.04698896, "balance_loss_mlp": 1.01722217, "epoch": 0.8071905248602176, "flos": 21544320038400.0, "grad_norm": 2.5282700047701567, "language_loss": 0.72120076, "learning_rate": 3.773085379882488e-07, "loss": 0.74290276, "num_input_tokens_seen": 144857125, "step": 6713, "time_per_iteration": 2.4920568466186523 }, { "auxiliary_loss_clip": 0.01152537, "auxiliary_loss_mlp": 0.0076128, "balance_loss_clip": 1.04582644, "balance_loss_mlp": 1.00032365, "epoch": 0.8073107677508568, "flos": 37268309105280.0, "grad_norm": 1.926765461329152, "language_loss": 0.75744176, "learning_rate": 3.768533002762715e-07, "loss": 0.77657992, "num_input_tokens_seen": 144880660, "step": 6714, "time_per_iteration": 2.627150535583496 }, { "auxiliary_loss_clip": 0.01135223, "auxiliary_loss_mlp": 0.01025903, "balance_loss_clip": 1.04274833, "balance_loss_mlp": 1.01931715, "epoch": 0.8074310106414958, "flos": 28366269759360.0, "grad_norm": 1.645101991601082, "language_loss": 0.76655716, "learning_rate": 3.763983087943572e-07, "loss": 0.78816843, "num_input_tokens_seen": 144900050, "step": 6715, "time_per_iteration": 4.181788444519043 }, { "auxiliary_loss_clip": 0.01142821, "auxiliary_loss_mlp": 0.00760995, "balance_loss_clip": 1.04484391, "balance_loss_mlp": 1.00030255, "epoch": 0.8075512535321349, "flos": 24281646768000.0, "grad_norm": 1.6026081494774203, "language_loss": 0.80991662, "learning_rate": 3.759435636115282e-07, "loss": 0.82895476, "num_input_tokens_seen": 144920835, "step": 6716, "time_per_iteration": 3.1999378204345703 }, { "auxiliary_loss_clip": 0.01088327, "auxiliary_loss_mlp": 0.00760457, "balance_loss_clip": 1.04193711, "balance_loss_mlp": 1.00029731, "epoch": 0.807671496422774, "flos": 26030855283840.0, "grad_norm": 1.704914704259896, "language_loss": 0.72834814, "learning_rate": 3.7548906479676967e-07, "loss": 0.74683595, "num_input_tokens_seen": 144940430, "step": 6717, "time_per_iteration": 2.6426594257354736 }, { "auxiliary_loss_clip": 0.01153823, "auxiliary_loss_mlp": 0.0102417, "balance_loss_clip": 1.04604197, "balance_loss_mlp": 1.01717198, "epoch": 0.8077917393134131, "flos": 23730740899200.0, "grad_norm": 1.5988757626103358, "language_loss": 0.71594125, "learning_rate": 3.7503481241902855e-07, "loss": 0.7377212, "num_input_tokens_seen": 144960405, "step": 6718, "time_per_iteration": 2.51042103767395 }, { "auxiliary_loss_clip": 0.01137502, "auxiliary_loss_mlp": 0.00760262, "balance_loss_clip": 1.045223, "balance_loss_mlp": 1.00028014, "epoch": 0.8079119822040521, "flos": 18402028398720.0, "grad_norm": 1.5924042388100152, "language_loss": 0.80211306, "learning_rate": 3.745808065472145e-07, "loss": 0.8210907, "num_input_tokens_seen": 144977700, "step": 6719, "time_per_iteration": 2.4997830390930176 }, { "auxiliary_loss_clip": 0.01147754, "auxiliary_loss_mlp": 0.01027129, "balance_loss_clip": 1.04965103, "balance_loss_mlp": 1.02073944, "epoch": 0.8080322250946913, "flos": 23621787970560.0, "grad_norm": 1.4719904616025614, "language_loss": 0.76315981, "learning_rate": 3.741270472501994e-07, "loss": 0.78490865, "num_input_tokens_seen": 144998340, "step": 6720, "time_per_iteration": 2.5011088848114014 }, { "auxiliary_loss_clip": 0.01135769, "auxiliary_loss_mlp": 0.01024271, "balance_loss_clip": 1.04733884, "balance_loss_mlp": 1.0172075, "epoch": 0.8081524679853304, "flos": 22820692896000.0, "grad_norm": 1.618210669297089, "language_loss": 0.72948617, "learning_rate": 3.736735345968183e-07, "loss": 0.75108659, "num_input_tokens_seen": 145017950, "step": 6721, "time_per_iteration": 2.5314385890960693 }, { "auxiliary_loss_clip": 0.01152405, "auxiliary_loss_mlp": 0.01028242, "balance_loss_clip": 1.04780173, "balance_loss_mlp": 1.02115798, "epoch": 0.8082727108759694, "flos": 17640004343040.0, "grad_norm": 1.5907601751374836, "language_loss": 0.78560448, "learning_rate": 3.7322026865586986e-07, "loss": 0.80741096, "num_input_tokens_seen": 145036985, "step": 6722, "time_per_iteration": 2.458688974380493 }, { "auxiliary_loss_clip": 0.011605, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.05203772, "balance_loss_mlp": 1.01898909, "epoch": 0.8083929537666086, "flos": 25958172113280.0, "grad_norm": 1.8972568071915916, "language_loss": 0.73357159, "learning_rate": 3.7276724949611206e-07, "loss": 0.75543815, "num_input_tokens_seen": 145057095, "step": 6723, "time_per_iteration": 2.520411729812622 }, { "auxiliary_loss_clip": 0.01140305, "auxiliary_loss_mlp": 0.01031713, "balance_loss_clip": 1.04695344, "balance_loss_mlp": 1.02400637, "epoch": 0.8085131966572476, "flos": 27089178629760.0, "grad_norm": 1.8306855338065091, "language_loss": 0.7516911, "learning_rate": 3.723144771862694e-07, "loss": 0.77341133, "num_input_tokens_seen": 145077735, "step": 6724, "time_per_iteration": 2.5508809089660645 }, { "auxiliary_loss_clip": 0.01126359, "auxiliary_loss_mlp": 0.01029275, "balance_loss_clip": 1.0433315, "balance_loss_mlp": 1.02262902, "epoch": 0.8086334395478867, "flos": 23988543788160.0, "grad_norm": 1.5236435885284427, "language_loss": 0.76609969, "learning_rate": 3.718619517950263e-07, "loss": 0.78765601, "num_input_tokens_seen": 145098330, "step": 6725, "time_per_iteration": 2.5840864181518555 }, { "auxiliary_loss_clip": 0.01166679, "auxiliary_loss_mlp": 0.0102606, "balance_loss_clip": 1.05063558, "balance_loss_mlp": 1.01946223, "epoch": 0.8087536824385259, "flos": 20405879406720.0, "grad_norm": 1.7449578403070263, "language_loss": 0.76797557, "learning_rate": 3.714096733910301e-07, "loss": 0.78990299, "num_input_tokens_seen": 145115855, "step": 6726, "time_per_iteration": 2.4257378578186035 }, { "auxiliary_loss_clip": 0.01156273, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 1.04830873, "balance_loss_mlp": 1.02041137, "epoch": 0.8088739253291649, "flos": 25919639798400.0, "grad_norm": 1.971728595241666, "language_loss": 0.70091099, "learning_rate": 3.709576420428926e-07, "loss": 0.72275031, "num_input_tokens_seen": 145136655, "step": 6727, "time_per_iteration": 2.5234479904174805 }, { "auxiliary_loss_clip": 0.01136417, "auxiliary_loss_mlp": 0.01023025, "balance_loss_clip": 1.04202986, "balance_loss_mlp": 1.01636732, "epoch": 0.808994168219804, "flos": 28402072640640.0, "grad_norm": 1.970576635328107, "language_loss": 0.73199034, "learning_rate": 3.7050585781918463e-07, "loss": 0.75358474, "num_input_tokens_seen": 145156955, "step": 6728, "time_per_iteration": 2.558981418609619 }, { "auxiliary_loss_clip": 0.01154172, "auxiliary_loss_mlp": 0.01032518, "balance_loss_clip": 1.04801977, "balance_loss_mlp": 1.02524877, "epoch": 0.8091144111104431, "flos": 17421056991360.0, "grad_norm": 1.9345798547360664, "language_loss": 0.69021749, "learning_rate": 3.700543207884428e-07, "loss": 0.71208435, "num_input_tokens_seen": 145173865, "step": 6729, "time_per_iteration": 2.46881103515625 }, { "auxiliary_loss_clip": 0.01151473, "auxiliary_loss_mlp": 0.01028184, "balance_loss_clip": 1.04837394, "balance_loss_mlp": 1.02124608, "epoch": 0.8092346540010822, "flos": 32153803361280.0, "grad_norm": 1.6996815326324561, "language_loss": 0.70940953, "learning_rate": 3.6960303101916466e-07, "loss": 0.73120612, "num_input_tokens_seen": 145193780, "step": 6730, "time_per_iteration": 3.2852981090545654 }, { "auxiliary_loss_clip": 0.01055522, "auxiliary_loss_mlp": 0.00751323, "balance_loss_clip": 1.00796902, "balance_loss_mlp": 1.00009811, "epoch": 0.8093548968917212, "flos": 58035093390720.0, "grad_norm": 0.7618086512161657, "language_loss": 0.55576146, "learning_rate": 3.6915198857981047e-07, "loss": 0.57382989, "num_input_tokens_seen": 145258980, "step": 6731, "time_per_iteration": 3.10363507270813 }, { "auxiliary_loss_clip": 0.01118688, "auxiliary_loss_mlp": 0.01031525, "balance_loss_clip": 1.04278994, "balance_loss_mlp": 1.02448559, "epoch": 0.8094751397823604, "flos": 27381599251200.0, "grad_norm": 1.8236806996200032, "language_loss": 0.68096799, "learning_rate": 3.687011935388027e-07, "loss": 0.70247012, "num_input_tokens_seen": 145281875, "step": 6732, "time_per_iteration": 2.608764410018921 }, { "auxiliary_loss_clip": 0.01149085, "auxiliary_loss_mlp": 0.01029726, "balance_loss_clip": 1.04638386, "balance_loss_mlp": 1.02237332, "epoch": 0.8095953826729995, "flos": 24061083304320.0, "grad_norm": 2.167976816476336, "language_loss": 0.72803485, "learning_rate": 3.6825064596452646e-07, "loss": 0.74982303, "num_input_tokens_seen": 145302220, "step": 6733, "time_per_iteration": 2.530694007873535 }, { "auxiliary_loss_clip": 0.01151344, "auxiliary_loss_mlp": 0.0102875, "balance_loss_clip": 1.04611301, "balance_loss_mlp": 1.02226162, "epoch": 0.8097156255636385, "flos": 23951412103680.0, "grad_norm": 1.539155486737868, "language_loss": 0.70544779, "learning_rate": 3.678003459253305e-07, "loss": 0.72724867, "num_input_tokens_seen": 145323070, "step": 6734, "time_per_iteration": 2.5403409004211426 }, { "auxiliary_loss_clip": 0.01124446, "auxiliary_loss_mlp": 0.0102607, "balance_loss_clip": 1.0455519, "balance_loss_mlp": 1.01927519, "epoch": 0.8098358684542777, "flos": 21799142098560.0, "grad_norm": 2.491277754960323, "language_loss": 0.74140179, "learning_rate": 3.673502934895236e-07, "loss": 0.76290691, "num_input_tokens_seen": 145342575, "step": 6735, "time_per_iteration": 2.5551421642303467 }, { "auxiliary_loss_clip": 0.01055881, "auxiliary_loss_mlp": 0.01003393, "balance_loss_clip": 1.00841248, "balance_loss_mlp": 1.00235558, "epoch": 0.8099561113449167, "flos": 68809515966720.0, "grad_norm": 0.6887222967758427, "language_loss": 0.57960469, "learning_rate": 3.669004887253802e-07, "loss": 0.60019743, "num_input_tokens_seen": 145408865, "step": 6736, "time_per_iteration": 3.195160150527954 }, { "auxiliary_loss_clip": 0.01141629, "auxiliary_loss_mlp": 0.01030366, "balance_loss_clip": 1.0475893, "balance_loss_mlp": 1.0238421, "epoch": 0.8100763542355558, "flos": 23586056916480.0, "grad_norm": 1.7017866657285234, "language_loss": 0.78836453, "learning_rate": 3.664509317011335e-07, "loss": 0.81008446, "num_input_tokens_seen": 145429200, "step": 6737, "time_per_iteration": 2.5371413230895996 }, { "auxiliary_loss_clip": 0.01152692, "auxiliary_loss_mlp": 0.01028601, "balance_loss_clip": 1.04989457, "balance_loss_mlp": 1.02091825, "epoch": 0.810196597126195, "flos": 31650408207360.0, "grad_norm": 1.7709280712288527, "language_loss": 0.74110758, "learning_rate": 3.6600162248498134e-07, "loss": 0.7629205, "num_input_tokens_seen": 145452830, "step": 6738, "time_per_iteration": 2.567059278488159 }, { "auxiliary_loss_clip": 0.01078657, "auxiliary_loss_mlp": 0.01024554, "balance_loss_clip": 1.03702044, "balance_loss_mlp": 1.01806319, "epoch": 0.810316840016834, "flos": 24900459298560.0, "grad_norm": 1.8157210869276579, "language_loss": 0.75805569, "learning_rate": 3.6555256114508426e-07, "loss": 0.77908778, "num_input_tokens_seen": 145472625, "step": 6739, "time_per_iteration": 2.635758638381958 }, { "auxiliary_loss_clip": 0.01136741, "auxiliary_loss_mlp": 0.01029549, "balance_loss_clip": 1.04206276, "balance_loss_mlp": 1.02228606, "epoch": 0.8104370829074731, "flos": 27965003950080.0, "grad_norm": 2.0783908195107736, "language_loss": 0.72613406, "learning_rate": 3.651037477495642e-07, "loss": 0.74779689, "num_input_tokens_seen": 145494075, "step": 6740, "time_per_iteration": 2.575836181640625 }, { "auxiliary_loss_clip": 0.01163842, "auxiliary_loss_mlp": 0.01029328, "balance_loss_clip": 1.04745913, "balance_loss_mlp": 1.02207422, "epoch": 0.8105573257981122, "flos": 24640752988800.0, "grad_norm": 1.8101517450497882, "language_loss": 0.68040943, "learning_rate": 3.6465518236650584e-07, "loss": 0.70234114, "num_input_tokens_seen": 145514220, "step": 6741, "time_per_iteration": 4.061638832092285 }, { "auxiliary_loss_clip": 0.01123312, "auxiliary_loss_mlp": 0.01023713, "balance_loss_clip": 1.04241037, "balance_loss_mlp": 1.01691473, "epoch": 0.8106775686887513, "flos": 26358935132160.0, "grad_norm": 1.5725752626689806, "language_loss": 0.7821238, "learning_rate": 3.642068650639558e-07, "loss": 0.80359405, "num_input_tokens_seen": 145533965, "step": 6742, "time_per_iteration": 2.5929312705993652 }, { "auxiliary_loss_clip": 0.01128984, "auxiliary_loss_mlp": 0.01024416, "balance_loss_clip": 1.04025817, "balance_loss_mlp": 1.0175916, "epoch": 0.8107978115793903, "flos": 27271892136960.0, "grad_norm": 1.6979431590537866, "language_loss": 0.64317226, "learning_rate": 3.6375879590992334e-07, "loss": 0.66470623, "num_input_tokens_seen": 145554310, "step": 6743, "time_per_iteration": 2.56786847114563 }, { "auxiliary_loss_clip": 0.0113379, "auxiliary_loss_mlp": 0.01024721, "balance_loss_clip": 1.04592228, "balance_loss_mlp": 1.01730299, "epoch": 0.8109180544700295, "flos": 24934322845440.0, "grad_norm": 1.799478112163573, "language_loss": 0.81111926, "learning_rate": 3.6331097497238173e-07, "loss": 0.83270431, "num_input_tokens_seen": 145573755, "step": 6744, "time_per_iteration": 2.538419008255005 }, { "auxiliary_loss_clip": 0.01123682, "auxiliary_loss_mlp": 0.01026351, "balance_loss_clip": 1.04448152, "balance_loss_mlp": 1.02002132, "epoch": 0.8110382973606686, "flos": 21105383840640.0, "grad_norm": 1.9542343161377356, "language_loss": 0.79800558, "learning_rate": 3.628634023192627e-07, "loss": 0.81950593, "num_input_tokens_seen": 145594000, "step": 6745, "time_per_iteration": 2.580892324447632 }, { "auxiliary_loss_clip": 0.01152705, "auxiliary_loss_mlp": 0.01022049, "balance_loss_clip": 1.0462873, "balance_loss_mlp": 1.01482487, "epoch": 0.8111585402513076, "flos": 15414081500160.0, "grad_norm": 8.988291761890526, "language_loss": 0.75489938, "learning_rate": 3.624160780184644e-07, "loss": 0.77664691, "num_input_tokens_seen": 145611215, "step": 6746, "time_per_iteration": 2.4571259021759033 }, { "auxiliary_loss_clip": 0.01133144, "auxiliary_loss_mlp": 0.01025759, "balance_loss_clip": 1.04535651, "balance_loss_mlp": 1.01826715, "epoch": 0.8112787831419467, "flos": 24095736950400.0, "grad_norm": 1.5834022826619456, "language_loss": 0.74227405, "learning_rate": 3.6196900213784496e-07, "loss": 0.76386315, "num_input_tokens_seen": 145630530, "step": 6747, "time_per_iteration": 2.542808771133423 }, { "auxiliary_loss_clip": 0.01151554, "auxiliary_loss_mlp": 0.01027883, "balance_loss_clip": 1.04640019, "balance_loss_mlp": 1.02112949, "epoch": 0.8113990260325858, "flos": 20483374999680.0, "grad_norm": 1.883831502598574, "language_loss": 0.86661965, "learning_rate": 3.6152217474522527e-07, "loss": 0.88841397, "num_input_tokens_seen": 145647345, "step": 6748, "time_per_iteration": 2.4763739109039307 }, { "auxiliary_loss_clip": 0.01152566, "auxiliary_loss_mlp": 0.01021474, "balance_loss_clip": 1.04987192, "balance_loss_mlp": 1.01476598, "epoch": 0.8115192689232249, "flos": 24901141656960.0, "grad_norm": 1.4925523394449782, "language_loss": 0.72677743, "learning_rate": 3.6107559590838975e-07, "loss": 0.74851781, "num_input_tokens_seen": 145666330, "step": 6749, "time_per_iteration": 2.533613681793213 }, { "auxiliary_loss_clip": 0.01087644, "auxiliary_loss_mlp": 0.01031666, "balance_loss_clip": 1.03838658, "balance_loss_mlp": 1.02435815, "epoch": 0.811639511813864, "flos": 24057204635520.0, "grad_norm": 2.467950668594518, "language_loss": 0.66181672, "learning_rate": 3.606292656950822e-07, "loss": 0.68300986, "num_input_tokens_seen": 145684740, "step": 6750, "time_per_iteration": 2.625168561935425 }, { "auxiliary_loss_clip": 0.0112998, "auxiliary_loss_mlp": 0.01029176, "balance_loss_clip": 1.04189444, "balance_loss_mlp": 1.02159762, "epoch": 0.8117597547045031, "flos": 23185150243200.0, "grad_norm": 3.4462262494780127, "language_loss": 0.86629862, "learning_rate": 3.601831841730121e-07, "loss": 0.88789022, "num_input_tokens_seen": 145702660, "step": 6751, "time_per_iteration": 2.515833854675293 }, { "auxiliary_loss_clip": 0.01150318, "auxiliary_loss_mlp": 0.01025291, "balance_loss_clip": 1.04660249, "balance_loss_mlp": 1.01807272, "epoch": 0.8118799975951422, "flos": 23040250778880.0, "grad_norm": 1.581954657765605, "language_loss": 0.72588289, "learning_rate": 3.5973735140984916e-07, "loss": 0.74763894, "num_input_tokens_seen": 145722830, "step": 6752, "time_per_iteration": 2.5380849838256836 }, { "auxiliary_loss_clip": 0.01104501, "auxiliary_loss_mlp": 0.00760812, "balance_loss_clip": 1.04093456, "balance_loss_mlp": 1.00024033, "epoch": 0.8120002404857812, "flos": 24639962889600.0, "grad_norm": 1.8245109382896405, "language_loss": 0.79391062, "learning_rate": 3.5929176747322607e-07, "loss": 0.81256378, "num_input_tokens_seen": 145741935, "step": 6753, "time_per_iteration": 2.6144087314605713 }, { "auxiliary_loss_clip": 0.01037767, "auxiliary_loss_mlp": 0.01000695, "balance_loss_clip": 1.00821829, "balance_loss_mlp": 0.99963969, "epoch": 0.8121204833764204, "flos": 57415742156160.0, "grad_norm": 0.818770455586398, "language_loss": 0.56304932, "learning_rate": 3.588464324307372e-07, "loss": 0.58343399, "num_input_tokens_seen": 145805560, "step": 6754, "time_per_iteration": 3.16375732421875 }, { "auxiliary_loss_clip": 0.01151771, "auxiliary_loss_mlp": 0.01028759, "balance_loss_clip": 1.0447278, "balance_loss_mlp": 1.02141309, "epoch": 0.8122407262670595, "flos": 19464589549440.0, "grad_norm": 1.7051218188005042, "language_loss": 0.75398338, "learning_rate": 3.584013463499391e-07, "loss": 0.77578866, "num_input_tokens_seen": 145824180, "step": 6755, "time_per_iteration": 2.48321795463562 }, { "auxiliary_loss_clip": 0.01039315, "auxiliary_loss_mlp": 0.00999642, "balance_loss_clip": 1.01176035, "balance_loss_mlp": 0.99853289, "epoch": 0.8123609691576985, "flos": 56425325472000.0, "grad_norm": 0.7378537864920456, "language_loss": 0.6448555, "learning_rate": 3.579565092983521e-07, "loss": 0.66524506, "num_input_tokens_seen": 145885300, "step": 6756, "time_per_iteration": 3.781142473220825 }, { "auxiliary_loss_clip": 0.0116515, "auxiliary_loss_mlp": 0.01033846, "balance_loss_clip": 1.04942262, "balance_loss_mlp": 1.02675903, "epoch": 0.8124812120483377, "flos": 20631973564800.0, "grad_norm": 1.9962546057621966, "language_loss": 0.83676046, "learning_rate": 3.575119213434565e-07, "loss": 0.85875034, "num_input_tokens_seen": 145903815, "step": 6757, "time_per_iteration": 2.4570813179016113 }, { "auxiliary_loss_clip": 0.01151727, "auxiliary_loss_mlp": 0.0102495, "balance_loss_clip": 1.04844308, "balance_loss_mlp": 1.01777399, "epoch": 0.8126014549389767, "flos": 22492397566080.0, "grad_norm": 1.7695805055040827, "language_loss": 0.81749421, "learning_rate": 3.5706758255269765e-07, "loss": 0.83926105, "num_input_tokens_seen": 145922270, "step": 6758, "time_per_iteration": 2.4993133544921875 }, { "auxiliary_loss_clip": 0.01142559, "auxiliary_loss_mlp": 0.01031767, "balance_loss_clip": 1.04732573, "balance_loss_mlp": 1.02483797, "epoch": 0.8127216978296158, "flos": 23287961946240.0, "grad_norm": 2.0578144999559935, "language_loss": 0.69767725, "learning_rate": 3.566234929934795e-07, "loss": 0.71942049, "num_input_tokens_seen": 145941470, "step": 6759, "time_per_iteration": 2.5252883434295654 }, { "auxiliary_loss_clip": 0.01147126, "auxiliary_loss_mlp": 0.01030459, "balance_loss_clip": 1.04710674, "balance_loss_mlp": 1.02359581, "epoch": 0.812841940720255, "flos": 25154994049920.0, "grad_norm": 1.596941612907656, "language_loss": 0.71791756, "learning_rate": 3.561796527331706e-07, "loss": 0.7396934, "num_input_tokens_seen": 145963145, "step": 6760, "time_per_iteration": 2.534383773803711 }, { "auxiliary_loss_clip": 0.01123581, "auxiliary_loss_mlp": 0.0102466, "balance_loss_clip": 1.04295051, "balance_loss_mlp": 1.01790082, "epoch": 0.812962183610894, "flos": 26648446752000.0, "grad_norm": 1.891489208631019, "language_loss": 0.77819705, "learning_rate": 3.5573606183910163e-07, "loss": 0.79967952, "num_input_tokens_seen": 145983150, "step": 6761, "time_per_iteration": 2.5945050716400146 }, { "auxiliary_loss_clip": 0.0115632, "auxiliary_loss_mlp": 0.0102808, "balance_loss_clip": 1.04561031, "balance_loss_mlp": 1.0207572, "epoch": 0.8130824265015331, "flos": 24966965329920.0, "grad_norm": 1.7507876507399072, "language_loss": 0.78691852, "learning_rate": 3.5529272037856493e-07, "loss": 0.80876249, "num_input_tokens_seen": 146001365, "step": 6762, "time_per_iteration": 2.516690492630005 }, { "auxiliary_loss_clip": 0.0101233, "auxiliary_loss_mlp": 0.01000116, "balance_loss_clip": 1.00964868, "balance_loss_mlp": 0.9989835, "epoch": 0.8132026693921722, "flos": 67622918175360.0, "grad_norm": 0.709413191553818, "language_loss": 0.53910118, "learning_rate": 3.548496284188149e-07, "loss": 0.55922568, "num_input_tokens_seen": 146061570, "step": 6763, "time_per_iteration": 3.2818145751953125 }, { "auxiliary_loss_clip": 0.01104555, "auxiliary_loss_mlp": 0.01026683, "balance_loss_clip": 1.04551721, "balance_loss_mlp": 1.02014422, "epoch": 0.8133229122828113, "flos": 19495149045120.0, "grad_norm": 5.560505104844581, "language_loss": 0.79016185, "learning_rate": 3.544067860270681e-07, "loss": 0.81147426, "num_input_tokens_seen": 146079145, "step": 6764, "time_per_iteration": 2.719434976577759 }, { "auxiliary_loss_clip": 0.01124448, "auxiliary_loss_mlp": 0.01027776, "balance_loss_clip": 1.04329753, "balance_loss_mlp": 1.02047718, "epoch": 0.8134431551734503, "flos": 20668135582080.0, "grad_norm": 1.5919296591999543, "language_loss": 0.70750415, "learning_rate": 3.539641932705029e-07, "loss": 0.72902638, "num_input_tokens_seen": 146097625, "step": 6765, "time_per_iteration": 2.5440261363983154 }, { "auxiliary_loss_clip": 0.01167195, "auxiliary_loss_mlp": 0.01026836, "balance_loss_clip": 1.04766536, "balance_loss_mlp": 1.0191884, "epoch": 0.8135633980640895, "flos": 21507332008320.0, "grad_norm": 2.1650447136805737, "language_loss": 0.77138084, "learning_rate": 3.53521850216262e-07, "loss": 0.79332113, "num_input_tokens_seen": 146117195, "step": 6766, "time_per_iteration": 2.4680590629577637 }, { "auxiliary_loss_clip": 0.01165743, "auxiliary_loss_mlp": 0.01033154, "balance_loss_clip": 1.04956222, "balance_loss_mlp": 1.02561438, "epoch": 0.8136836409547286, "flos": 20554442058240.0, "grad_norm": 2.344517666628521, "language_loss": 0.76946139, "learning_rate": 3.530797569314461e-07, "loss": 0.79145038, "num_input_tokens_seen": 146136220, "step": 6767, "time_per_iteration": 3.6066341400146484 }, { "auxiliary_loss_clip": 0.01164379, "auxiliary_loss_mlp": 0.01031328, "balance_loss_clip": 1.04797709, "balance_loss_mlp": 1.0240978, "epoch": 0.8138038838453676, "flos": 20299045380480.0, "grad_norm": 1.7332947765378452, "language_loss": 0.77709574, "learning_rate": 3.5263791348312235e-07, "loss": 0.79905283, "num_input_tokens_seen": 146155415, "step": 6768, "time_per_iteration": 3.20334792137146 }, { "auxiliary_loss_clip": 0.01134449, "auxiliary_loss_mlp": 0.01027207, "balance_loss_clip": 1.04436159, "balance_loss_mlp": 1.01991153, "epoch": 0.8139241267360068, "flos": 29789840551680.0, "grad_norm": 1.7127113297632148, "language_loss": 0.70757997, "learning_rate": 3.521963199383171e-07, "loss": 0.72919655, "num_input_tokens_seen": 146178370, "step": 6769, "time_per_iteration": 2.6040656566619873 }, { "auxiliary_loss_clip": 0.01109011, "auxiliary_loss_mlp": 0.01026551, "balance_loss_clip": 1.04203653, "balance_loss_mlp": 1.01852834, "epoch": 0.8140443696266458, "flos": 19713270384000.0, "grad_norm": 2.0914741343791996, "language_loss": 0.76594824, "learning_rate": 3.517549763640197e-07, "loss": 0.78730381, "num_input_tokens_seen": 146196010, "step": 6770, "time_per_iteration": 2.616095542907715 }, { "auxiliary_loss_clip": 0.01149709, "auxiliary_loss_mlp": 0.00760684, "balance_loss_clip": 1.04857063, "balance_loss_mlp": 1.00026846, "epoch": 0.8141646125172849, "flos": 27160568910720.0, "grad_norm": 1.716762393948832, "language_loss": 0.7085669, "learning_rate": 3.513138828271829e-07, "loss": 0.72767091, "num_input_tokens_seen": 146215880, "step": 6771, "time_per_iteration": 2.5563526153564453 }, { "auxiliary_loss_clip": 0.01117001, "auxiliary_loss_mlp": 0.01024273, "balance_loss_clip": 1.04289639, "balance_loss_mlp": 1.01793098, "epoch": 0.8142848554079241, "flos": 39673102700160.0, "grad_norm": 2.8984066415245184, "language_loss": 0.69872916, "learning_rate": 3.508730393947179e-07, "loss": 0.72014195, "num_input_tokens_seen": 146239135, "step": 6772, "time_per_iteration": 2.6939961910247803 }, { "auxiliary_loss_clip": 0.01125326, "auxiliary_loss_mlp": 0.01023774, "balance_loss_clip": 1.04822803, "balance_loss_mlp": 1.01735437, "epoch": 0.8144050982985631, "flos": 22237288197120.0, "grad_norm": 1.7386539981088658, "language_loss": 0.71851611, "learning_rate": 3.504324461335024e-07, "loss": 0.74000704, "num_input_tokens_seen": 146259245, "step": 6773, "time_per_iteration": 2.564802885055542 }, { "auxiliary_loss_clip": 0.01099527, "auxiliary_loss_mlp": 0.01026524, "balance_loss_clip": 1.0393703, "balance_loss_mlp": 1.0192579, "epoch": 0.8145253411892022, "flos": 23038239617280.0, "grad_norm": 2.191220537628639, "language_loss": 0.88013035, "learning_rate": 3.499921031103732e-07, "loss": 0.90139091, "num_input_tokens_seen": 146280015, "step": 6774, "time_per_iteration": 2.6176445484161377 }, { "auxiliary_loss_clip": 0.01127747, "auxiliary_loss_mlp": 0.01023461, "balance_loss_clip": 1.04220748, "balance_loss_mlp": 1.01640129, "epoch": 0.8146455840798413, "flos": 24827668387200.0, "grad_norm": 1.630796501862283, "language_loss": 0.7833156, "learning_rate": 3.4955201039212987e-07, "loss": 0.80482769, "num_input_tokens_seen": 146300935, "step": 6775, "time_per_iteration": 2.5870580673217773 }, { "auxiliary_loss_clip": 0.01151766, "auxiliary_loss_mlp": 0.01032086, "balance_loss_clip": 1.04703748, "balance_loss_mlp": 1.02486753, "epoch": 0.8147658269704804, "flos": 19974520978560.0, "grad_norm": 2.083598147556066, "language_loss": 0.65456808, "learning_rate": 3.4911216804553465e-07, "loss": 0.67640662, "num_input_tokens_seen": 146319835, "step": 6776, "time_per_iteration": 2.4784457683563232 }, { "auxiliary_loss_clip": 0.01136192, "auxiliary_loss_mlp": 0.01025903, "balance_loss_clip": 1.04501462, "balance_loss_mlp": 1.01852703, "epoch": 0.8148860698611194, "flos": 21178031097600.0, "grad_norm": 1.840022980978433, "language_loss": 0.70380008, "learning_rate": 3.4867257613731017e-07, "loss": 0.72542107, "num_input_tokens_seen": 146339030, "step": 6777, "time_per_iteration": 2.520625591278076 }, { "auxiliary_loss_clip": 0.01139684, "auxiliary_loss_mlp": 0.01027055, "balance_loss_clip": 1.0448817, "balance_loss_mlp": 1.02023363, "epoch": 0.8150063127517585, "flos": 19606903234560.0, "grad_norm": 1.9377258407557598, "language_loss": 0.8550638, "learning_rate": 3.4823323473414343e-07, "loss": 0.87673122, "num_input_tokens_seen": 146358550, "step": 6778, "time_per_iteration": 2.50909686088562 }, { "auxiliary_loss_clip": 0.01125726, "auxiliary_loss_mlp": 0.01028351, "balance_loss_clip": 1.04460657, "balance_loss_mlp": 1.02048957, "epoch": 0.8151265556423977, "flos": 22638374438400.0, "grad_norm": 2.068153244170498, "language_loss": 0.76184547, "learning_rate": 3.477941439026812e-07, "loss": 0.78338623, "num_input_tokens_seen": 146376770, "step": 6779, "time_per_iteration": 2.5577714443206787 }, { "auxiliary_loss_clip": 0.01135389, "auxiliary_loss_mlp": 0.01030166, "balance_loss_clip": 1.04782701, "balance_loss_mlp": 1.02332294, "epoch": 0.8152467985330367, "flos": 17968048277760.0, "grad_norm": 1.775856162213348, "language_loss": 0.73086643, "learning_rate": 3.473553037095349e-07, "loss": 0.75252193, "num_input_tokens_seen": 146395795, "step": 6780, "time_per_iteration": 2.489179849624634 }, { "auxiliary_loss_clip": 0.01131427, "auxiliary_loss_mlp": 0.01032215, "balance_loss_clip": 1.0441339, "balance_loss_mlp": 1.02540481, "epoch": 0.8153670414236758, "flos": 24969012405120.0, "grad_norm": 1.689369790478916, "language_loss": 0.83334225, "learning_rate": 3.469167142212743e-07, "loss": 0.85497868, "num_input_tokens_seen": 146417640, "step": 6781, "time_per_iteration": 2.5664689540863037 }, { "auxiliary_loss_clip": 0.01152065, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.04850936, "balance_loss_mlp": 1.02345026, "epoch": 0.8154872843143149, "flos": 31066069754880.0, "grad_norm": 2.1672605136354846, "language_loss": 0.63251305, "learning_rate": 3.4647837550443337e-07, "loss": 0.65433931, "num_input_tokens_seen": 146436205, "step": 6782, "time_per_iteration": 3.3715970516204834 }, { "auxiliary_loss_clip": 0.01124221, "auxiliary_loss_mlp": 0.01025467, "balance_loss_clip": 1.04431021, "balance_loss_mlp": 1.01814795, "epoch": 0.815607527204954, "flos": 19391654983680.0, "grad_norm": 2.4830154440441685, "language_loss": 0.74504256, "learning_rate": 3.460402876255086e-07, "loss": 0.76653945, "num_input_tokens_seen": 146453595, "step": 6783, "time_per_iteration": 2.536693811416626 }, { "auxiliary_loss_clip": 0.0115476, "auxiliary_loss_mlp": 0.01028891, "balance_loss_clip": 1.04780483, "balance_loss_mlp": 1.02202487, "epoch": 0.815727770095593, "flos": 26140418743680.0, "grad_norm": 2.5239171649958467, "language_loss": 0.7185452, "learning_rate": 3.456024506509574e-07, "loss": 0.74038172, "num_input_tokens_seen": 146474515, "step": 6784, "time_per_iteration": 2.5285286903381348 }, { "auxiliary_loss_clip": 0.01150049, "auxiliary_loss_mlp": 0.00760824, "balance_loss_clip": 1.04705393, "balance_loss_mlp": 1.00023758, "epoch": 0.8158480129862322, "flos": 25337527989120.0, "grad_norm": 1.505612895754183, "language_loss": 0.73600054, "learning_rate": 3.4516486464719873e-07, "loss": 0.75510925, "num_input_tokens_seen": 146493905, "step": 6785, "time_per_iteration": 2.534344434738159 }, { "auxiliary_loss_clip": 0.01104304, "auxiliary_loss_mlp": 0.01029443, "balance_loss_clip": 1.04094911, "balance_loss_mlp": 1.02235007, "epoch": 0.8159682558768713, "flos": 34423645559040.0, "grad_norm": 1.792305967465305, "language_loss": 0.62114483, "learning_rate": 3.4472752968061445e-07, "loss": 0.64248228, "num_input_tokens_seen": 146518335, "step": 6786, "time_per_iteration": 2.70717453956604 }, { "auxiliary_loss_clip": 0.01153084, "auxiliary_loss_mlp": 0.01021324, "balance_loss_clip": 1.04647362, "balance_loss_mlp": 1.01423097, "epoch": 0.8160884987675103, "flos": 18653223185280.0, "grad_norm": 1.799602755052315, "language_loss": 0.73584104, "learning_rate": 3.442904458175475e-07, "loss": 0.75758517, "num_input_tokens_seen": 146535655, "step": 6787, "time_per_iteration": 2.4612183570861816 }, { "auxiliary_loss_clip": 0.01147999, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.04408431, "balance_loss_mlp": 1.02189755, "epoch": 0.8162087416581495, "flos": 31430527102080.0, "grad_norm": 1.4950397508550584, "language_loss": 0.76229119, "learning_rate": 3.438536131243044e-07, "loss": 0.7840668, "num_input_tokens_seen": 146556815, "step": 6788, "time_per_iteration": 2.576498508453369 }, { "auxiliary_loss_clip": 0.01141827, "auxiliary_loss_mlp": 0.010286, "balance_loss_clip": 1.04599166, "balance_loss_mlp": 1.02104235, "epoch": 0.8163289845487885, "flos": 37593910915200.0, "grad_norm": 2.2249442790910683, "language_loss": 0.62193418, "learning_rate": 3.434170316671503e-07, "loss": 0.64363849, "num_input_tokens_seen": 146581845, "step": 6789, "time_per_iteration": 2.6674773693084717 }, { "auxiliary_loss_clip": 0.01121033, "auxiliary_loss_mlp": 0.01024138, "balance_loss_clip": 1.0479759, "balance_loss_mlp": 1.01733065, "epoch": 0.8164492274394276, "flos": 13953989554560.0, "grad_norm": 2.243991278622339, "language_loss": 0.89954734, "learning_rate": 3.4298070151231583e-07, "loss": 0.92099905, "num_input_tokens_seen": 146597245, "step": 6790, "time_per_iteration": 2.5472185611724854 }, { "auxiliary_loss_clip": 0.01142311, "auxiliary_loss_mlp": 0.01029438, "balance_loss_clip": 1.04588342, "balance_loss_mlp": 1.02272677, "epoch": 0.8165694703300668, "flos": 28986554747520.0, "grad_norm": 1.802792638527892, "language_loss": 0.59682214, "learning_rate": 3.425446227259916e-07, "loss": 0.61853969, "num_input_tokens_seen": 146618210, "step": 6791, "time_per_iteration": 2.571389675140381 }, { "auxiliary_loss_clip": 0.01137973, "auxiliary_loss_mlp": 0.01026695, "balance_loss_clip": 1.04461789, "balance_loss_mlp": 1.02038026, "epoch": 0.8166897132207058, "flos": 25118365155840.0, "grad_norm": 1.8719098817674136, "language_loss": 0.82111728, "learning_rate": 3.421087953743296e-07, "loss": 0.84276396, "num_input_tokens_seen": 146637975, "step": 6792, "time_per_iteration": 2.563957691192627 }, { "auxiliary_loss_clip": 0.01150136, "auxiliary_loss_mlp": 0.01025388, "balance_loss_clip": 1.04419184, "balance_loss_mlp": 1.01784849, "epoch": 0.8168099561113449, "flos": 23148593176320.0, "grad_norm": 2.1546609669312846, "language_loss": 0.80113047, "learning_rate": 3.416732195234464e-07, "loss": 0.82288575, "num_input_tokens_seen": 146658030, "step": 6793, "time_per_iteration": 4.050412893295288 }, { "auxiliary_loss_clip": 0.01152731, "auxiliary_loss_mlp": 0.01031242, "balance_loss_clip": 1.04563594, "balance_loss_mlp": 1.0241518, "epoch": 0.816930199001984, "flos": 18407666833920.0, "grad_norm": 1.496879881042609, "language_loss": 0.79234207, "learning_rate": 3.4123789523941613e-07, "loss": 0.8141818, "num_input_tokens_seen": 146677855, "step": 6794, "time_per_iteration": 2.4817593097686768 }, { "auxiliary_loss_clip": 0.01141613, "auxiliary_loss_mlp": 0.01030434, "balance_loss_clip": 1.04194415, "balance_loss_mlp": 1.02294755, "epoch": 0.8170504418926231, "flos": 21251324799360.0, "grad_norm": 1.5346526064888462, "language_loss": 0.63283229, "learning_rate": 3.4080282258827884e-07, "loss": 0.65455282, "num_input_tokens_seen": 146696230, "step": 6795, "time_per_iteration": 2.4972455501556396 }, { "auxiliary_loss_clip": 0.0115562, "auxiliary_loss_mlp": 0.01025782, "balance_loss_clip": 1.0471822, "balance_loss_mlp": 1.01857579, "epoch": 0.8171706847832622, "flos": 19099234362240.0, "grad_norm": 2.06796628126924, "language_loss": 0.72356987, "learning_rate": 3.403680016360342e-07, "loss": 0.74538392, "num_input_tokens_seen": 146714835, "step": 6796, "time_per_iteration": 2.484015703201294 }, { "auxiliary_loss_clip": 0.0114594, "auxiliary_loss_mlp": 0.01024617, "balance_loss_clip": 1.04778814, "balance_loss_mlp": 1.01761591, "epoch": 0.8172909276739013, "flos": 21470128496640.0, "grad_norm": 1.4579927137009172, "language_loss": 0.67463434, "learning_rate": 3.3993343244864403e-07, "loss": 0.69633985, "num_input_tokens_seen": 146734425, "step": 6797, "time_per_iteration": 2.48726487159729 }, { "auxiliary_loss_clip": 0.01149933, "auxiliary_loss_mlp": 0.01027877, "balance_loss_clip": 1.04656386, "balance_loss_mlp": 1.02130008, "epoch": 0.8174111705645404, "flos": 27599792417280.0, "grad_norm": 1.4730682539015059, "language_loss": 0.72492278, "learning_rate": 3.394991150920323e-07, "loss": 0.74670088, "num_input_tokens_seen": 146757545, "step": 6798, "time_per_iteration": 2.547252893447876 }, { "auxiliary_loss_clip": 0.01112818, "auxiliary_loss_mlp": 0.00761696, "balance_loss_clip": 1.04452348, "balance_loss_mlp": 1.00028348, "epoch": 0.8175314134551794, "flos": 14064594508800.0, "grad_norm": 2.055649009633026, "language_loss": 0.74143159, "learning_rate": 3.3906504963208396e-07, "loss": 0.76017678, "num_input_tokens_seen": 146774240, "step": 6799, "time_per_iteration": 2.5501158237457275 }, { "auxiliary_loss_clip": 0.01105179, "auxiliary_loss_mlp": 0.01025162, "balance_loss_clip": 1.04329848, "balance_loss_mlp": 1.01783991, "epoch": 0.8176516563458186, "flos": 22708076780160.0, "grad_norm": 1.8999517073164542, "language_loss": 0.6653204, "learning_rate": 3.3863123613464774e-07, "loss": 0.68662381, "num_input_tokens_seen": 146793140, "step": 6800, "time_per_iteration": 2.5917744636535645 }, { "auxiliary_loss_clip": 0.01134959, "auxiliary_loss_mlp": 0.01021384, "balance_loss_clip": 1.03994441, "balance_loss_mlp": 1.01457071, "epoch": 0.8177718992364577, "flos": 21945406279680.0, "grad_norm": 1.5819923857465883, "language_loss": 0.75060475, "learning_rate": 3.381976746655317e-07, "loss": 0.77216816, "num_input_tokens_seen": 146812895, "step": 6801, "time_per_iteration": 2.539149522781372 }, { "auxiliary_loss_clip": 0.01104498, "auxiliary_loss_mlp": 0.01030434, "balance_loss_clip": 1.04713869, "balance_loss_mlp": 1.02336168, "epoch": 0.8178921421270967, "flos": 22017443005440.0, "grad_norm": 1.8890408002682166, "language_loss": 0.66964477, "learning_rate": 3.3776436529050756e-07, "loss": 0.69099414, "num_input_tokens_seen": 146832445, "step": 6802, "time_per_iteration": 2.5756068229675293 }, { "auxiliary_loss_clip": 0.01162288, "auxiliary_loss_mlp": 0.01025139, "balance_loss_clip": 1.04683077, "balance_loss_mlp": 1.01823068, "epoch": 0.8180123850177359, "flos": 33183111496320.0, "grad_norm": 1.6717131366120992, "language_loss": 0.72862494, "learning_rate": 3.373313080753073e-07, "loss": 0.75049925, "num_input_tokens_seen": 146856505, "step": 6803, "time_per_iteration": 2.5665318965911865 }, { "auxiliary_loss_clip": 0.01144085, "auxiliary_loss_mlp": 0.01023697, "balance_loss_clip": 1.04217649, "balance_loss_mlp": 1.01660669, "epoch": 0.8181326279083749, "flos": 22091167670400.0, "grad_norm": 1.734407486016436, "language_loss": 0.77299374, "learning_rate": 3.3689850308562527e-07, "loss": 0.79467154, "num_input_tokens_seen": 146876950, "step": 6804, "time_per_iteration": 2.5004937648773193 }, { "auxiliary_loss_clip": 0.01102951, "auxiliary_loss_mlp": 0.01024072, "balance_loss_clip": 1.04397893, "balance_loss_mlp": 1.01739669, "epoch": 0.818252870799014, "flos": 15705747936000.0, "grad_norm": 1.6824076174678317, "language_loss": 0.77604461, "learning_rate": 3.364659503871183e-07, "loss": 0.79731488, "num_input_tokens_seen": 146894885, "step": 6805, "time_per_iteration": 2.566399574279785 }, { "auxiliary_loss_clip": 0.0112044, "auxiliary_loss_mlp": 0.01020262, "balance_loss_clip": 1.04145527, "balance_loss_mlp": 1.01399767, "epoch": 0.8183731136896532, "flos": 18770687637120.0, "grad_norm": 1.8677862605192477, "language_loss": 0.83699358, "learning_rate": 3.3603365004540417e-07, "loss": 0.85840058, "num_input_tokens_seen": 146913180, "step": 6806, "time_per_iteration": 2.543564796447754 }, { "auxiliary_loss_clip": 0.01165511, "auxiliary_loss_mlp": 0.01030528, "balance_loss_clip": 1.04925919, "balance_loss_mlp": 1.02335215, "epoch": 0.8184933565802922, "flos": 26541792293760.0, "grad_norm": 2.008387377506147, "language_loss": 0.76614594, "learning_rate": 3.356016021260624e-07, "loss": 0.78810632, "num_input_tokens_seen": 146933510, "step": 6807, "time_per_iteration": 2.489863872528076 }, { "auxiliary_loss_clip": 0.01152968, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.04792619, "balance_loss_mlp": 1.0200274, "epoch": 0.8186135994709313, "flos": 17530117660800.0, "grad_norm": 2.3955410922460496, "language_loss": 0.65548855, "learning_rate": 3.35169806694634e-07, "loss": 0.67728925, "num_input_tokens_seen": 146951760, "step": 6808, "time_per_iteration": 3.266005277633667 }, { "auxiliary_loss_clip": 0.01022873, "auxiliary_loss_mlp": 0.0100213, "balance_loss_clip": 1.0093224, "balance_loss_mlp": 1.00111663, "epoch": 0.8187338423615703, "flos": 63480300675840.0, "grad_norm": 0.7170233531181492, "language_loss": 0.60645962, "learning_rate": 3.3473826381662186e-07, "loss": 0.62670958, "num_input_tokens_seen": 147022900, "step": 6809, "time_per_iteration": 3.279881000518799 }, { "auxiliary_loss_clip": 0.01146391, "auxiliary_loss_mlp": 0.01023215, "balance_loss_clip": 1.04767883, "balance_loss_mlp": 1.01613069, "epoch": 0.8188540852522095, "flos": 17529974006400.0, "grad_norm": 1.857789863573793, "language_loss": 0.81561327, "learning_rate": 3.3430697355749216e-07, "loss": 0.83730936, "num_input_tokens_seen": 147040590, "step": 6810, "time_per_iteration": 2.4627137184143066 }, { "auxiliary_loss_clip": 0.01105402, "auxiliary_loss_mlp": 0.01033205, "balance_loss_clip": 1.0410676, "balance_loss_mlp": 1.02570343, "epoch": 0.8189743281428485, "flos": 14392530702720.0, "grad_norm": 1.91207718907085, "language_loss": 0.7542488, "learning_rate": 3.3387593598266907e-07, "loss": 0.77563488, "num_input_tokens_seen": 147057200, "step": 6811, "time_per_iteration": 2.5692479610443115 }, { "auxiliary_loss_clip": 0.01114699, "auxiliary_loss_mlp": 0.01021324, "balance_loss_clip": 1.04039669, "balance_loss_mlp": 1.01421022, "epoch": 0.8190945710334876, "flos": 25080479285760.0, "grad_norm": 1.5540986087225956, "language_loss": 0.78187108, "learning_rate": 3.3344515115754225e-07, "loss": 0.8032313, "num_input_tokens_seen": 147076180, "step": 6812, "time_per_iteration": 2.5800628662109375 }, { "auxiliary_loss_clip": 0.01123109, "auxiliary_loss_mlp": 0.01025121, "balance_loss_clip": 1.04266214, "balance_loss_mlp": 1.01782572, "epoch": 0.8192148139241268, "flos": 21507152440320.0, "grad_norm": 3.1682263257073973, "language_loss": 0.80145115, "learning_rate": 3.33014619147461e-07, "loss": 0.82293344, "num_input_tokens_seen": 147094205, "step": 6813, "time_per_iteration": 2.547830581665039 }, { "auxiliary_loss_clip": 0.01140156, "auxiliary_loss_mlp": 0.0103017, "balance_loss_clip": 1.04951453, "balance_loss_mlp": 1.02248693, "epoch": 0.8193350568147658, "flos": 23952166289280.0, "grad_norm": 1.8780589548372724, "language_loss": 0.71717393, "learning_rate": 3.325843400177362e-07, "loss": 0.73887718, "num_input_tokens_seen": 147115545, "step": 6814, "time_per_iteration": 2.5305545330047607 }, { "auxiliary_loss_clip": 0.01153809, "auxiliary_loss_mlp": 0.00760936, "balance_loss_clip": 1.04655004, "balance_loss_mlp": 1.00028479, "epoch": 0.8194552997054049, "flos": 20559469962240.0, "grad_norm": 1.718044440466529, "language_loss": 0.73602092, "learning_rate": 3.32154313833642e-07, "loss": 0.75516832, "num_input_tokens_seen": 147135700, "step": 6815, "time_per_iteration": 2.5088226795196533 }, { "auxiliary_loss_clip": 0.01165435, "auxiliary_loss_mlp": 0.01028785, "balance_loss_clip": 1.0475446, "balance_loss_mlp": 1.02103364, "epoch": 0.819575542596044, "flos": 26031753123840.0, "grad_norm": 2.3459036276879646, "language_loss": 0.59545577, "learning_rate": 3.3172454066041164e-07, "loss": 0.6173979, "num_input_tokens_seen": 147155205, "step": 6816, "time_per_iteration": 2.5126585960388184 }, { "auxiliary_loss_clip": 0.01093438, "auxiliary_loss_mlp": 0.00760079, "balance_loss_clip": 1.0417769, "balance_loss_mlp": 1.00025511, "epoch": 0.8196957854866831, "flos": 29096944220160.0, "grad_norm": 1.9010772360846389, "language_loss": 0.76310009, "learning_rate": 3.3129502056324234e-07, "loss": 0.78163528, "num_input_tokens_seen": 147176570, "step": 6817, "time_per_iteration": 2.6811389923095703 }, { "auxiliary_loss_clip": 0.00998317, "auxiliary_loss_mlp": 0.01002152, "balance_loss_clip": 1.01183498, "balance_loss_mlp": 1.00093043, "epoch": 0.8198160283773221, "flos": 69033631898880.0, "grad_norm": 0.9788672172636785, "language_loss": 0.5976724, "learning_rate": 3.3086575360729165e-07, "loss": 0.61767709, "num_input_tokens_seen": 147234105, "step": 6818, "time_per_iteration": 3.158604383468628 }, { "auxiliary_loss_clip": 0.01135642, "auxiliary_loss_mlp": 0.01026591, "balance_loss_clip": 1.04582453, "balance_loss_mlp": 1.01892579, "epoch": 0.8199362712679613, "flos": 16618058496000.0, "grad_norm": 1.619071396711758, "language_loss": 0.71539617, "learning_rate": 3.3043673985767906e-07, "loss": 0.73701853, "num_input_tokens_seen": 147253170, "step": 6819, "time_per_iteration": 3.8724524974823 }, { "auxiliary_loss_clip": 0.01112507, "auxiliary_loss_mlp": 0.01028755, "balance_loss_clip": 1.04082072, "balance_loss_mlp": 1.02165031, "epoch": 0.8200565141586004, "flos": 21757664868480.0, "grad_norm": 1.733740093195914, "language_loss": 0.77560717, "learning_rate": 3.3000797937948564e-07, "loss": 0.79701972, "num_input_tokens_seen": 147271465, "step": 6820, "time_per_iteration": 3.3193514347076416 }, { "auxiliary_loss_clip": 0.01022646, "auxiliary_loss_mlp": 0.0100292, "balance_loss_clip": 1.00926697, "balance_loss_mlp": 1.00186503, "epoch": 0.8201767570492394, "flos": 69807112392960.0, "grad_norm": 0.9274597134168876, "language_loss": 0.64963251, "learning_rate": 3.295794722377534e-07, "loss": 0.66988814, "num_input_tokens_seen": 147335070, "step": 6821, "time_per_iteration": 3.179305076599121 }, { "auxiliary_loss_clip": 0.01159561, "auxiliary_loss_mlp": 0.01025235, "balance_loss_clip": 1.04500449, "balance_loss_mlp": 1.018345, "epoch": 0.8202969999398786, "flos": 23111892455040.0, "grad_norm": 1.4973075312463564, "language_loss": 0.79638612, "learning_rate": 3.291512184974876e-07, "loss": 0.81823409, "num_input_tokens_seen": 147355460, "step": 6822, "time_per_iteration": 2.475217819213867 }, { "auxiliary_loss_clip": 0.01133746, "auxiliary_loss_mlp": 0.01029674, "balance_loss_clip": 1.04197192, "balance_loss_mlp": 1.02231884, "epoch": 0.8204172428305176, "flos": 28220616109440.0, "grad_norm": 1.6497699352216888, "language_loss": 0.66492373, "learning_rate": 3.2872321822365346e-07, "loss": 0.68655795, "num_input_tokens_seen": 147375675, "step": 6823, "time_per_iteration": 2.581940174102783 }, { "auxiliary_loss_clip": 0.0114775, "auxiliary_loss_mlp": 0.01026508, "balance_loss_clip": 1.04593551, "balance_loss_mlp": 1.01954901, "epoch": 0.8205374857211567, "flos": 20887011106560.0, "grad_norm": 1.7508468158134722, "language_loss": 0.73225856, "learning_rate": 3.282954714811783e-07, "loss": 0.75400114, "num_input_tokens_seen": 147394580, "step": 6824, "time_per_iteration": 2.4890992641448975 }, { "auxiliary_loss_clip": 0.01123604, "auxiliary_loss_mlp": 0.01023972, "balance_loss_clip": 1.04060197, "balance_loss_mlp": 1.01701951, "epoch": 0.8206577286117959, "flos": 13152140294400.0, "grad_norm": 2.551101378042838, "language_loss": 0.7101748, "learning_rate": 3.2786797833495093e-07, "loss": 0.73165053, "num_input_tokens_seen": 147409935, "step": 6825, "time_per_iteration": 2.4846715927124023 }, { "auxiliary_loss_clip": 0.01162042, "auxiliary_loss_mlp": 0.01025972, "balance_loss_clip": 1.04708838, "balance_loss_mlp": 1.01902533, "epoch": 0.8207779715024349, "flos": 25265634917760.0, "grad_norm": 1.8982591425304003, "language_loss": 0.72883856, "learning_rate": 3.274407388498213e-07, "loss": 0.75071871, "num_input_tokens_seen": 147428065, "step": 6826, "time_per_iteration": 2.4966301918029785 }, { "auxiliary_loss_clip": 0.01116006, "auxiliary_loss_mlp": 0.01017874, "balance_loss_clip": 1.04152691, "balance_loss_mlp": 1.0108223, "epoch": 0.820898214393074, "flos": 19610243199360.0, "grad_norm": 1.7985357007481377, "language_loss": 0.7404412, "learning_rate": 3.270137530906021e-07, "loss": 0.76178002, "num_input_tokens_seen": 147447300, "step": 6827, "time_per_iteration": 2.577653408050537 }, { "auxiliary_loss_clip": 0.01101013, "auxiliary_loss_mlp": 0.01023981, "balance_loss_clip": 1.04293096, "balance_loss_mlp": 1.01749647, "epoch": 0.8210184572837131, "flos": 15596615439360.0, "grad_norm": 1.938158448934175, "language_loss": 0.83396029, "learning_rate": 3.265870211220665e-07, "loss": 0.85521019, "num_input_tokens_seen": 147465135, "step": 6828, "time_per_iteration": 2.5631890296936035 }, { "auxiliary_loss_clip": 0.01118219, "auxiliary_loss_mlp": 0.01031921, "balance_loss_clip": 1.04302478, "balance_loss_mlp": 1.02410626, "epoch": 0.8211387001743522, "flos": 20813932886400.0, "grad_norm": 2.579462168645033, "language_loss": 0.82113957, "learning_rate": 3.2616054300894934e-07, "loss": 0.842641, "num_input_tokens_seen": 147484585, "step": 6829, "time_per_iteration": 2.5804059505462646 }, { "auxiliary_loss_clip": 0.01124786, "auxiliary_loss_mlp": 0.01030319, "balance_loss_clip": 1.0433774, "balance_loss_mlp": 1.02311265, "epoch": 0.8212589430649913, "flos": 27704579368320.0, "grad_norm": 1.9695810141866232, "language_loss": 0.84545052, "learning_rate": 3.2573431881594693e-07, "loss": 0.86700153, "num_input_tokens_seen": 147504130, "step": 6830, "time_per_iteration": 2.5663044452667236 }, { "auxiliary_loss_clip": 0.01092955, "auxiliary_loss_mlp": 0.01021861, "balance_loss_clip": 1.0382359, "balance_loss_mlp": 1.01482153, "epoch": 0.8213791859556304, "flos": 22455625017600.0, "grad_norm": 1.9470781660651826, "language_loss": 0.65750802, "learning_rate": 3.2530834860771663e-07, "loss": 0.67865622, "num_input_tokens_seen": 147523510, "step": 6831, "time_per_iteration": 2.6149399280548096 }, { "auxiliary_loss_clip": 0.01150593, "auxiliary_loss_mlp": 0.01024935, "balance_loss_clip": 1.04559052, "balance_loss_mlp": 1.01780033, "epoch": 0.8214994288462695, "flos": 16654471908480.0, "grad_norm": 1.9227332179090004, "language_loss": 0.74230158, "learning_rate": 3.248826324488794e-07, "loss": 0.7640568, "num_input_tokens_seen": 147540805, "step": 6832, "time_per_iteration": 2.475402593612671 }, { "auxiliary_loss_clip": 0.01165325, "auxiliary_loss_mlp": 0.01023703, "balance_loss_clip": 1.0503155, "balance_loss_mlp": 1.0169853, "epoch": 0.8216196717369085, "flos": 25221787390080.0, "grad_norm": 1.7356946631938204, "language_loss": 0.87901926, "learning_rate": 3.244571704040138e-07, "loss": 0.90090954, "num_input_tokens_seen": 147560965, "step": 6833, "time_per_iteration": 2.4789998531341553 }, { "auxiliary_loss_clip": 0.0114732, "auxiliary_loss_mlp": 0.01026011, "balance_loss_clip": 1.04402959, "balance_loss_mlp": 1.01866829, "epoch": 0.8217399146275477, "flos": 25371930240000.0, "grad_norm": 2.5092900202712363, "language_loss": 0.73286933, "learning_rate": 3.2403196253766374e-07, "loss": 0.75460267, "num_input_tokens_seen": 147580045, "step": 6834, "time_per_iteration": 3.6174943447113037 }, { "auxiliary_loss_clip": 0.01148503, "auxiliary_loss_mlp": 0.01029643, "balance_loss_clip": 1.04620886, "balance_loss_mlp": 1.02179575, "epoch": 0.8218601575181868, "flos": 25629625388160.0, "grad_norm": 2.4784791358861042, "language_loss": 0.789873, "learning_rate": 3.2360700891433254e-07, "loss": 0.81165451, "num_input_tokens_seen": 147599070, "step": 6835, "time_per_iteration": 2.558473825454712 }, { "auxiliary_loss_clip": 0.01017003, "auxiliary_loss_mlp": 0.01002488, "balance_loss_clip": 1.0106312, "balance_loss_mlp": 1.00146317, "epoch": 0.8219804004088258, "flos": 67660229427840.0, "grad_norm": 0.8016290706564742, "language_loss": 0.57334048, "learning_rate": 3.231823095984847e-07, "loss": 0.59353542, "num_input_tokens_seen": 147653710, "step": 6836, "time_per_iteration": 3.085634469985962 }, { "auxiliary_loss_clip": 0.01136089, "auxiliary_loss_mlp": 0.01026388, "balance_loss_clip": 1.04609787, "balance_loss_mlp": 1.01967645, "epoch": 0.822100643299465, "flos": 19464266327040.0, "grad_norm": 2.190555133817007, "language_loss": 0.75894642, "learning_rate": 3.2275786465454814e-07, "loss": 0.78057116, "num_input_tokens_seen": 147670360, "step": 6837, "time_per_iteration": 2.494976282119751 }, { "auxiliary_loss_clip": 0.01120757, "auxiliary_loss_mlp": 0.0102207, "balance_loss_clip": 1.0434984, "balance_loss_mlp": 1.0149529, "epoch": 0.822220886190104, "flos": 24681368292480.0, "grad_norm": 1.634330416389713, "language_loss": 0.75182664, "learning_rate": 3.2233367414690917e-07, "loss": 0.77325487, "num_input_tokens_seen": 147692550, "step": 6838, "time_per_iteration": 2.5809335708618164 }, { "auxiliary_loss_clip": 0.01119415, "auxiliary_loss_mlp": 0.01023596, "balance_loss_clip": 1.04212165, "balance_loss_mlp": 1.01709032, "epoch": 0.8223411290807431, "flos": 27819062991360.0, "grad_norm": 2.104766468206614, "language_loss": 0.8487277, "learning_rate": 3.219097381399183e-07, "loss": 0.87015784, "num_input_tokens_seen": 147709725, "step": 6839, "time_per_iteration": 2.6004672050476074 }, { "auxiliary_loss_clip": 0.01143641, "auxiliary_loss_mlp": 0.01026429, "balance_loss_clip": 1.04651105, "balance_loss_mlp": 1.01946759, "epoch": 0.8224613719713821, "flos": 23218546913280.0, "grad_norm": 1.6922950298044408, "language_loss": 0.81252682, "learning_rate": 3.2148605669788584e-07, "loss": 0.83422756, "num_input_tokens_seen": 147729615, "step": 6840, "time_per_iteration": 2.5499320030212402 }, { "auxiliary_loss_clip": 0.01140134, "auxiliary_loss_mlp": 0.01025919, "balance_loss_clip": 1.04629135, "balance_loss_mlp": 1.01896369, "epoch": 0.8225816148620213, "flos": 15706250726400.0, "grad_norm": 2.172386854654803, "language_loss": 0.77545398, "learning_rate": 3.2106262988508405e-07, "loss": 0.79711449, "num_input_tokens_seen": 147747665, "step": 6841, "time_per_iteration": 2.5042037963867188 }, { "auxiliary_loss_clip": 0.01138113, "auxiliary_loss_mlp": 0.01029096, "balance_loss_clip": 1.04552114, "balance_loss_mlp": 1.02191329, "epoch": 0.8227018577526604, "flos": 18515111391360.0, "grad_norm": 1.6786229747416912, "language_loss": 0.74013418, "learning_rate": 3.206394577657465e-07, "loss": 0.76180625, "num_input_tokens_seen": 147765445, "step": 6842, "time_per_iteration": 2.506777763366699 }, { "auxiliary_loss_clip": 0.01155044, "auxiliary_loss_mlp": 0.0102489, "balance_loss_clip": 1.04723024, "balance_loss_mlp": 1.01718903, "epoch": 0.8228221006432994, "flos": 22236785406720.0, "grad_norm": 2.2329974980578755, "language_loss": 0.72520655, "learning_rate": 3.202165404040675e-07, "loss": 0.74700588, "num_input_tokens_seen": 147783365, "step": 6843, "time_per_iteration": 2.494231939315796 }, { "auxiliary_loss_clip": 0.01093347, "auxiliary_loss_mlp": 0.01024843, "balance_loss_clip": 1.0417459, "balance_loss_mlp": 1.01771474, "epoch": 0.8229423435339386, "flos": 24097532630400.0, "grad_norm": 2.2906272524392084, "language_loss": 0.74515563, "learning_rate": 3.1979387786420396e-07, "loss": 0.76633751, "num_input_tokens_seen": 147803605, "step": 6844, "time_per_iteration": 3.541156053543091 }, { "auxiliary_loss_clip": 0.01135149, "auxiliary_loss_mlp": 0.01025479, "balance_loss_clip": 1.04108143, "balance_loss_mlp": 1.01845479, "epoch": 0.8230625864245776, "flos": 23878549365120.0, "grad_norm": 1.7268959727786048, "language_loss": 0.81789529, "learning_rate": 3.1937147021027346e-07, "loss": 0.83950162, "num_input_tokens_seen": 147822060, "step": 6845, "time_per_iteration": 3.302060127258301 }, { "auxiliary_loss_clip": 0.01150495, "auxiliary_loss_mlp": 0.01025866, "balance_loss_clip": 1.04745793, "balance_loss_mlp": 1.01888633, "epoch": 0.8231828293152167, "flos": 16581106379520.0, "grad_norm": 2.2872735110871476, "language_loss": 0.76380217, "learning_rate": 3.189493175063547e-07, "loss": 0.78556579, "num_input_tokens_seen": 147839295, "step": 6846, "time_per_iteration": 2.4687583446502686 }, { "auxiliary_loss_clip": 0.01139044, "auxiliary_loss_mlp": 0.01025159, "balance_loss_clip": 1.04629302, "balance_loss_mlp": 1.01783323, "epoch": 0.8233030722058559, "flos": 18880071528960.0, "grad_norm": 2.0049695139086436, "language_loss": 0.66900969, "learning_rate": 3.1852741981648776e-07, "loss": 0.69065177, "num_input_tokens_seen": 147857945, "step": 6847, "time_per_iteration": 2.5121095180511475 }, { "auxiliary_loss_clip": 0.01109686, "auxiliary_loss_mlp": 0.01025549, "balance_loss_clip": 1.04235137, "balance_loss_mlp": 1.01848602, "epoch": 0.8234233150964949, "flos": 28439024757120.0, "grad_norm": 1.860320517922056, "language_loss": 0.69983637, "learning_rate": 3.1810577720467404e-07, "loss": 0.72118878, "num_input_tokens_seen": 147879675, "step": 6848, "time_per_iteration": 2.612328290939331 }, { "auxiliary_loss_clip": 0.01138714, "auxiliary_loss_mlp": 0.0102345, "balance_loss_clip": 1.04459381, "balance_loss_mlp": 1.01633656, "epoch": 0.823543557987134, "flos": 33765941577600.0, "grad_norm": 1.6355701032923053, "language_loss": 0.56227827, "learning_rate": 3.176843897348769e-07, "loss": 0.58389986, "num_input_tokens_seen": 147902870, "step": 6849, "time_per_iteration": 2.6879642009735107 }, { "auxiliary_loss_clip": 0.01131796, "auxiliary_loss_mlp": 0.01028514, "balance_loss_clip": 1.04397655, "balance_loss_mlp": 1.02116442, "epoch": 0.8236638008777731, "flos": 17092366611840.0, "grad_norm": 2.7398267244715586, "language_loss": 0.75384313, "learning_rate": 3.1726325747102034e-07, "loss": 0.77544618, "num_input_tokens_seen": 147921245, "step": 6850, "time_per_iteration": 2.485578775405884 }, { "auxiliary_loss_clip": 0.01101845, "auxiliary_loss_mlp": 0.01030538, "balance_loss_clip": 1.03677607, "balance_loss_mlp": 1.0230993, "epoch": 0.8237840437684122, "flos": 61639982334720.0, "grad_norm": 1.474646525181093, "language_loss": 0.63907039, "learning_rate": 3.1684238047698974e-07, "loss": 0.66039419, "num_input_tokens_seen": 147949515, "step": 6851, "time_per_iteration": 2.958782434463501 }, { "auxiliary_loss_clip": 0.011385, "auxiliary_loss_mlp": 0.01029227, "balance_loss_clip": 1.044976, "balance_loss_mlp": 1.02192807, "epoch": 0.8239042866590512, "flos": 27309023821440.0, "grad_norm": 2.460208046094842, "language_loss": 0.53089166, "learning_rate": 3.1642175881663155e-07, "loss": 0.55256885, "num_input_tokens_seen": 147969245, "step": 6852, "time_per_iteration": 2.584423065185547 }, { "auxiliary_loss_clip": 0.01162184, "auxiliary_loss_mlp": 0.01021517, "balance_loss_clip": 1.04606497, "balance_loss_mlp": 1.01441216, "epoch": 0.8240245295496904, "flos": 21726351187200.0, "grad_norm": 2.2317803227243336, "language_loss": 0.84044206, "learning_rate": 3.160013925537537e-07, "loss": 0.86227906, "num_input_tokens_seen": 147990080, "step": 6853, "time_per_iteration": 2.468045473098755 }, { "auxiliary_loss_clip": 0.01122142, "auxiliary_loss_mlp": 0.01026732, "balance_loss_clip": 1.04285288, "balance_loss_mlp": 1.01989198, "epoch": 0.8241447724403295, "flos": 20009318279040.0, "grad_norm": 2.2926231382498825, "language_loss": 0.75748992, "learning_rate": 3.155812817521266e-07, "loss": 0.77897865, "num_input_tokens_seen": 148010455, "step": 6854, "time_per_iteration": 2.5640451908111572 }, { "auxiliary_loss_clip": 0.01143246, "auxiliary_loss_mlp": 0.01027387, "balance_loss_clip": 1.04832053, "balance_loss_mlp": 1.02053273, "epoch": 0.8242650153309685, "flos": 22272983337600.0, "grad_norm": 1.826629152210343, "language_loss": 0.78189373, "learning_rate": 3.151614264754787e-07, "loss": 0.80360007, "num_input_tokens_seen": 148028400, "step": 6855, "time_per_iteration": 2.5237014293670654 }, { "auxiliary_loss_clip": 0.01163963, "auxiliary_loss_mlp": 0.01025369, "balance_loss_clip": 1.04616857, "balance_loss_mlp": 1.01807094, "epoch": 0.8243852582216077, "flos": 22309971367680.0, "grad_norm": 1.9510005595881217, "language_loss": 0.79374212, "learning_rate": 3.147418267875035e-07, "loss": 0.81563544, "num_input_tokens_seen": 148046530, "step": 6856, "time_per_iteration": 2.4558022022247314 }, { "auxiliary_loss_clip": 0.01087832, "auxiliary_loss_mlp": 0.00760799, "balance_loss_clip": 1.03655076, "balance_loss_mlp": 1.00028992, "epoch": 0.8245055011122467, "flos": 24645421756800.0, "grad_norm": 2.1169697292430163, "language_loss": 0.65284157, "learning_rate": 3.1432248275185315e-07, "loss": 0.67132783, "num_input_tokens_seen": 148067040, "step": 6857, "time_per_iteration": 2.6704189777374268 }, { "auxiliary_loss_clip": 0.01149333, "auxiliary_loss_mlp": 0.01024836, "balance_loss_clip": 1.04811621, "balance_loss_mlp": 1.01767182, "epoch": 0.8246257440028858, "flos": 17487275713920.0, "grad_norm": 1.9437145071144222, "language_loss": 0.76817179, "learning_rate": 3.139033944321412e-07, "loss": 0.78991348, "num_input_tokens_seen": 148084400, "step": 6858, "time_per_iteration": 2.459165096282959 }, { "auxiliary_loss_clip": 0.01153492, "auxiliary_loss_mlp": 0.01026324, "balance_loss_clip": 1.04622531, "balance_loss_mlp": 1.01913011, "epoch": 0.824745986893525, "flos": 25010130499200.0, "grad_norm": 1.6532719057592513, "language_loss": 0.79160112, "learning_rate": 3.1348456189194507e-07, "loss": 0.81339931, "num_input_tokens_seen": 148104860, "step": 6859, "time_per_iteration": 2.530111074447632 }, { "auxiliary_loss_clip": 0.01112303, "auxiliary_loss_mlp": 0.01024522, "balance_loss_clip": 1.04065561, "balance_loss_mlp": 1.01766133, "epoch": 0.824866229784164, "flos": 18772698798720.0, "grad_norm": 1.750713222753815, "language_loss": 0.82620406, "learning_rate": 3.1306598519479876e-07, "loss": 0.84757227, "num_input_tokens_seen": 148124680, "step": 6860, "time_per_iteration": 3.3977999687194824 }, { "auxiliary_loss_clip": 0.01133552, "auxiliary_loss_mlp": 0.01025781, "balance_loss_clip": 1.04525816, "balance_loss_mlp": 1.01930833, "epoch": 0.8249864726748031, "flos": 23842171866240.0, "grad_norm": 1.5828399546271836, "language_loss": 0.78141296, "learning_rate": 3.1264766440420177e-07, "loss": 0.80300629, "num_input_tokens_seen": 148147150, "step": 6861, "time_per_iteration": 2.5627734661102295 }, { "auxiliary_loss_clip": 0.01146372, "auxiliary_loss_mlp": 0.01025705, "balance_loss_clip": 1.04616451, "balance_loss_mlp": 1.01862705, "epoch": 0.8251067155654422, "flos": 20303103617280.0, "grad_norm": 1.896400830604076, "language_loss": 0.68980575, "learning_rate": 3.122295995836124e-07, "loss": 0.71152651, "num_input_tokens_seen": 148167020, "step": 6862, "time_per_iteration": 2.4984629154205322 }, { "auxiliary_loss_clip": 0.01153555, "auxiliary_loss_mlp": 0.01025379, "balance_loss_clip": 1.04498267, "balance_loss_mlp": 1.01783848, "epoch": 0.8252269584560813, "flos": 25009699536000.0, "grad_norm": 1.689114526578596, "language_loss": 0.77192575, "learning_rate": 3.118117907964508e-07, "loss": 0.79371512, "num_input_tokens_seen": 148188965, "step": 6863, "time_per_iteration": 2.5498530864715576 }, { "auxiliary_loss_clip": 0.01129824, "auxiliary_loss_mlp": 0.01027635, "balance_loss_clip": 1.04482639, "balance_loss_mlp": 1.02109015, "epoch": 0.8253472013467203, "flos": 17128564542720.0, "grad_norm": 1.9237896926429172, "language_loss": 0.80308539, "learning_rate": 3.1139423810609856e-07, "loss": 0.82465994, "num_input_tokens_seen": 148205660, "step": 6864, "time_per_iteration": 2.568167209625244 }, { "auxiliary_loss_clip": 0.01163011, "auxiliary_loss_mlp": 0.01020734, "balance_loss_clip": 1.04694128, "balance_loss_mlp": 1.01367998, "epoch": 0.8254674442373595, "flos": 22414794232320.0, "grad_norm": 1.9550759246208957, "language_loss": 0.75406456, "learning_rate": 3.1097694157589714e-07, "loss": 0.77590203, "num_input_tokens_seen": 148225545, "step": 6865, "time_per_iteration": 2.495314121246338 }, { "auxiliary_loss_clip": 0.01152317, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 1.04896069, "balance_loss_mlp": 1.02261066, "epoch": 0.8255876871279986, "flos": 24786765774720.0, "grad_norm": 2.9714693029573525, "language_loss": 0.76619518, "learning_rate": 3.105599012691511e-07, "loss": 0.78801644, "num_input_tokens_seen": 148243975, "step": 6866, "time_per_iteration": 2.510484218597412 }, { "auxiliary_loss_clip": 0.01145938, "auxiliary_loss_mlp": 0.0102365, "balance_loss_clip": 1.04548979, "balance_loss_mlp": 1.0168997, "epoch": 0.8257079300186376, "flos": 27455431656960.0, "grad_norm": 1.4312701539216148, "language_loss": 0.82365167, "learning_rate": 3.101431172491249e-07, "loss": 0.84534752, "num_input_tokens_seen": 148265520, "step": 6867, "time_per_iteration": 2.559732437133789 }, { "auxiliary_loss_clip": 0.01125139, "auxiliary_loss_mlp": 0.00760766, "balance_loss_clip": 1.04126549, "balance_loss_mlp": 1.00025725, "epoch": 0.8258281729092768, "flos": 16471866142080.0, "grad_norm": 3.08716701889091, "language_loss": 0.72361755, "learning_rate": 3.097265895790444e-07, "loss": 0.74247658, "num_input_tokens_seen": 148283730, "step": 6868, "time_per_iteration": 2.5346410274505615 }, { "auxiliary_loss_clip": 0.01124763, "auxiliary_loss_mlp": 0.01026462, "balance_loss_clip": 1.04431164, "balance_loss_mlp": 1.01902318, "epoch": 0.8259484157999158, "flos": 21433822824960.0, "grad_norm": 1.7133003405348077, "language_loss": 0.82990414, "learning_rate": 3.093103183220962e-07, "loss": 0.85141641, "num_input_tokens_seen": 148303775, "step": 6869, "time_per_iteration": 2.5667788982391357 }, { "auxiliary_loss_clip": 0.01046511, "auxiliary_loss_mlp": 0.01001626, "balance_loss_clip": 1.00897717, "balance_loss_mlp": 1.00047517, "epoch": 0.8260686586905549, "flos": 58322342453760.0, "grad_norm": 0.8201978931200202, "language_loss": 0.59424853, "learning_rate": 3.0889430354142796e-07, "loss": 0.61472988, "num_input_tokens_seen": 148365285, "step": 6870, "time_per_iteration": 3.9366321563720703 }, { "auxiliary_loss_clip": 0.01124207, "auxiliary_loss_mlp": 0.01026644, "balance_loss_clip": 1.04185557, "balance_loss_mlp": 1.01967573, "epoch": 0.826188901581194, "flos": 27527288814720.0, "grad_norm": 2.0981164144633255, "language_loss": 0.69977057, "learning_rate": 3.084785453001497e-07, "loss": 0.72127908, "num_input_tokens_seen": 148386200, "step": 6871, "time_per_iteration": 2.636141061782837 }, { "auxiliary_loss_clip": 0.01137018, "auxiliary_loss_mlp": 0.00760738, "balance_loss_clip": 1.04740727, "balance_loss_mlp": 1.00029433, "epoch": 0.8263091444718331, "flos": 23696051339520.0, "grad_norm": 2.232135199372507, "language_loss": 0.81604826, "learning_rate": 3.080630436613314e-07, "loss": 0.83502591, "num_input_tokens_seen": 148403970, "step": 6872, "time_per_iteration": 3.2671101093292236 }, { "auxiliary_loss_clip": 0.0114188, "auxiliary_loss_mlp": 0.01024109, "balance_loss_clip": 1.04434478, "balance_loss_mlp": 1.01761484, "epoch": 0.8264293873624722, "flos": 17165157523200.0, "grad_norm": 1.9390269680909522, "language_loss": 0.85696697, "learning_rate": 3.076477986880039e-07, "loss": 0.87862682, "num_input_tokens_seen": 148421765, "step": 6873, "time_per_iteration": 2.488128662109375 }, { "auxiliary_loss_clip": 0.0113611, "auxiliary_loss_mlp": 0.01026244, "balance_loss_clip": 1.04614544, "balance_loss_mlp": 1.01928186, "epoch": 0.8265496302531112, "flos": 24098645952000.0, "grad_norm": 1.968577843498901, "language_loss": 0.69294572, "learning_rate": 3.0723281044315986e-07, "loss": 0.71456927, "num_input_tokens_seen": 148443720, "step": 6874, "time_per_iteration": 2.56144118309021 }, { "auxiliary_loss_clip": 0.01162419, "auxiliary_loss_mlp": 0.01025503, "balance_loss_clip": 1.04715431, "balance_loss_mlp": 1.01919675, "epoch": 0.8266698731437504, "flos": 14099894599680.0, "grad_norm": 1.9238378969905074, "language_loss": 0.76437068, "learning_rate": 3.068180789897521e-07, "loss": 0.78624988, "num_input_tokens_seen": 148462130, "step": 6875, "time_per_iteration": 2.4342234134674072 }, { "auxiliary_loss_clip": 0.01153114, "auxiliary_loss_mlp": 0.01024365, "balance_loss_clip": 1.04554033, "balance_loss_mlp": 1.01724792, "epoch": 0.8267901160343895, "flos": 30777563715840.0, "grad_norm": 2.480386747495741, "language_loss": 0.81311107, "learning_rate": 3.064036043906966e-07, "loss": 0.83488584, "num_input_tokens_seen": 148485570, "step": 6876, "time_per_iteration": 2.5871052742004395 }, { "auxiliary_loss_clip": 0.01130077, "auxiliary_loss_mlp": 0.01029186, "balance_loss_clip": 1.04416645, "balance_loss_mlp": 1.02196217, "epoch": 0.8269103589250285, "flos": 40624915242240.0, "grad_norm": 1.9132545019362743, "language_loss": 0.67835331, "learning_rate": 3.059893867088668e-07, "loss": 0.69994593, "num_input_tokens_seen": 148509715, "step": 6877, "time_per_iteration": 2.7533822059631348 }, { "auxiliary_loss_clip": 0.01147926, "auxiliary_loss_mlp": 0.01025764, "balance_loss_clip": 1.04546595, "balance_loss_mlp": 1.01878381, "epoch": 0.8270306018156677, "flos": 30263645877120.0, "grad_norm": 1.8799802502088918, "language_loss": 0.66675746, "learning_rate": 3.055754260071004e-07, "loss": 0.68849444, "num_input_tokens_seen": 148532010, "step": 6878, "time_per_iteration": 2.5612289905548096 }, { "auxiliary_loss_clip": 0.01150996, "auxiliary_loss_mlp": 0.01027779, "balance_loss_clip": 1.0471034, "balance_loss_mlp": 1.02094233, "epoch": 0.8271508447063067, "flos": 25226599812480.0, "grad_norm": 2.0347420030660373, "language_loss": 0.73398846, "learning_rate": 3.051617223481948e-07, "loss": 0.75577623, "num_input_tokens_seen": 148553330, "step": 6879, "time_per_iteration": 2.533963441848755 }, { "auxiliary_loss_clip": 0.01131193, "auxiliary_loss_mlp": 0.01034432, "balance_loss_clip": 1.04524362, "balance_loss_mlp": 1.0262928, "epoch": 0.8272710875969458, "flos": 17566602900480.0, "grad_norm": 1.951111925215039, "language_loss": 0.74954289, "learning_rate": 3.047482757949078e-07, "loss": 0.77119917, "num_input_tokens_seen": 148570960, "step": 6880, "time_per_iteration": 2.5471670627593994 }, { "auxiliary_loss_clip": 0.01119687, "auxiliary_loss_mlp": 0.00759808, "balance_loss_clip": 1.04186463, "balance_loss_mlp": 1.00023389, "epoch": 0.827391330487585, "flos": 19755465886080.0, "grad_norm": 1.9237567424870434, "language_loss": 0.85930359, "learning_rate": 3.043350864099605e-07, "loss": 0.87809849, "num_input_tokens_seen": 148589520, "step": 6881, "time_per_iteration": 2.5626373291015625 }, { "auxiliary_loss_clip": 0.01152492, "auxiliary_loss_mlp": 0.01024267, "balance_loss_clip": 1.04589725, "balance_loss_mlp": 1.01732898, "epoch": 0.827511573378224, "flos": 16835174254080.0, "grad_norm": 2.136965144662471, "language_loss": 0.80607367, "learning_rate": 3.039221542560315e-07, "loss": 0.82784128, "num_input_tokens_seen": 148606085, "step": 6882, "time_per_iteration": 2.4555037021636963 }, { "auxiliary_loss_clip": 0.01147946, "auxiliary_loss_mlp": 0.01029232, "balance_loss_clip": 1.0468514, "balance_loss_mlp": 1.02206123, "epoch": 0.8276318162688631, "flos": 18369242259840.0, "grad_norm": 1.9765585519574091, "language_loss": 0.73803556, "learning_rate": 3.0350947939576356e-07, "loss": 0.75980735, "num_input_tokens_seen": 148625240, "step": 6883, "time_per_iteration": 2.487490177154541 }, { "auxiliary_loss_clip": 0.01153298, "auxiliary_loss_mlp": 0.01030552, "balance_loss_clip": 1.046067, "balance_loss_mlp": 1.02368248, "epoch": 0.8277520591595022, "flos": 19352691705600.0, "grad_norm": 1.5444719206469868, "language_loss": 0.72195923, "learning_rate": 3.0309706189175876e-07, "loss": 0.74379778, "num_input_tokens_seen": 148645075, "step": 6884, "time_per_iteration": 2.4765918254852295 }, { "auxiliary_loss_clip": 0.01037273, "auxiliary_loss_mlp": 0.01001254, "balance_loss_clip": 1.00892115, "balance_loss_mlp": 1.00016963, "epoch": 0.8278723020501413, "flos": 67918858329600.0, "grad_norm": 0.7549451449970025, "language_loss": 0.57411456, "learning_rate": 3.0268490180658045e-07, "loss": 0.59449983, "num_input_tokens_seen": 148707855, "step": 6885, "time_per_iteration": 3.1077535152435303 }, { "auxiliary_loss_clip": 0.01167039, "auxiliary_loss_mlp": 0.01026644, "balance_loss_clip": 1.04962456, "balance_loss_mlp": 1.01978099, "epoch": 0.8279925449407803, "flos": 18185738653440.0, "grad_norm": 2.1112361289801926, "language_loss": 0.79203427, "learning_rate": 3.0227299920275305e-07, "loss": 0.81397104, "num_input_tokens_seen": 148724170, "step": 6886, "time_per_iteration": 3.244901657104492 }, { "auxiliary_loss_clip": 0.01125859, "auxiliary_loss_mlp": 0.01029371, "balance_loss_clip": 1.04635489, "balance_loss_mlp": 1.02135456, "epoch": 0.8281127878314195, "flos": 20631434860800.0, "grad_norm": 1.8358536585927385, "language_loss": 0.85394937, "learning_rate": 3.018613541427613e-07, "loss": 0.87550163, "num_input_tokens_seen": 148743690, "step": 6887, "time_per_iteration": 2.542297601699829 }, { "auxiliary_loss_clip": 0.0115983, "auxiliary_loss_mlp": 0.01027034, "balance_loss_clip": 1.04453945, "balance_loss_mlp": 1.01976788, "epoch": 0.8282330307220586, "flos": 18004282122240.0, "grad_norm": 1.6007068669677438, "language_loss": 0.73481977, "learning_rate": 3.0144996668905243e-07, "loss": 0.75668842, "num_input_tokens_seen": 148761070, "step": 6888, "time_per_iteration": 2.4133596420288086 }, { "auxiliary_loss_clip": 0.01094927, "auxiliary_loss_mlp": 0.00760537, "balance_loss_clip": 1.03768885, "balance_loss_mlp": 1.00028276, "epoch": 0.8283532736126976, "flos": 20084120352000.0, "grad_norm": 1.9028374440675995, "language_loss": 0.81809688, "learning_rate": 3.010388369040331e-07, "loss": 0.83665156, "num_input_tokens_seen": 148779730, "step": 6889, "time_per_iteration": 2.6176748275756836 }, { "auxiliary_loss_clip": 0.01150986, "auxiliary_loss_mlp": 0.01024603, "balance_loss_clip": 1.04751623, "balance_loss_mlp": 1.0180589, "epoch": 0.8284735165033368, "flos": 31868421805440.0, "grad_norm": 1.792220357310396, "language_loss": 0.82707846, "learning_rate": 3.0062796485007156e-07, "loss": 0.8488344, "num_input_tokens_seen": 148800670, "step": 6890, "time_per_iteration": 2.5821361541748047 }, { "auxiliary_loss_clip": 0.0116457, "auxiliary_loss_mlp": 0.00761067, "balance_loss_clip": 1.04783356, "balance_loss_mlp": 1.00027943, "epoch": 0.8285937593939758, "flos": 26651319840000.0, "grad_norm": 2.2684641108547, "language_loss": 0.65847683, "learning_rate": 3.002173505894965e-07, "loss": 0.67773318, "num_input_tokens_seen": 148819820, "step": 6891, "time_per_iteration": 2.4991135597229004 }, { "auxiliary_loss_clip": 0.01154074, "auxiliary_loss_mlp": 0.01030438, "balance_loss_clip": 1.04579234, "balance_loss_mlp": 1.02285624, "epoch": 0.8287140022846149, "flos": 20193683811840.0, "grad_norm": 3.3767732423336883, "language_loss": 0.62769824, "learning_rate": 2.998069941845973e-07, "loss": 0.64954329, "num_input_tokens_seen": 148838890, "step": 6892, "time_per_iteration": 2.4892897605895996 }, { "auxiliary_loss_clip": 0.01055727, "auxiliary_loss_mlp": 0.01001909, "balance_loss_clip": 1.00843453, "balance_loss_mlp": 1.00085449, "epoch": 0.8288342451752541, "flos": 70755980019840.0, "grad_norm": 0.7200024382137162, "language_loss": 0.57480383, "learning_rate": 2.993968956976258e-07, "loss": 0.59538031, "num_input_tokens_seen": 148906635, "step": 6893, "time_per_iteration": 3.163667917251587 }, { "auxiliary_loss_clip": 0.01171216, "auxiliary_loss_mlp": 0.01029209, "balance_loss_clip": 1.0500226, "balance_loss_mlp": 1.0217073, "epoch": 0.8289544880658931, "flos": 24572235795840.0, "grad_norm": 3.091984748916916, "language_loss": 0.70207721, "learning_rate": 2.9898705519079313e-07, "loss": 0.7240814, "num_input_tokens_seen": 148925740, "step": 6894, "time_per_iteration": 2.4864370822906494 }, { "auxiliary_loss_clip": 0.01128602, "auxiliary_loss_mlp": 0.01027917, "balance_loss_clip": 1.04358435, "balance_loss_mlp": 1.02096665, "epoch": 0.8290747309565322, "flos": 22273378387200.0, "grad_norm": 1.6647851795656414, "language_loss": 0.74570096, "learning_rate": 2.985774727262715e-07, "loss": 0.76726609, "num_input_tokens_seen": 148944585, "step": 6895, "time_per_iteration": 2.5299339294433594 }, { "auxiliary_loss_clip": 0.01160823, "auxiliary_loss_mlp": 0.01024228, "balance_loss_clip": 1.04553318, "balance_loss_mlp": 1.0178504, "epoch": 0.8291949738471713, "flos": 23255570856960.0, "grad_norm": 1.9060100923375793, "language_loss": 0.81304991, "learning_rate": 2.981681483661949e-07, "loss": 0.8349005, "num_input_tokens_seen": 148964170, "step": 6896, "time_per_iteration": 3.4981472492218018 }, { "auxiliary_loss_clip": 0.01155037, "auxiliary_loss_mlp": 0.01027507, "balance_loss_clip": 1.05027318, "balance_loss_mlp": 1.01990116, "epoch": 0.8293152167378104, "flos": 52555768185600.0, "grad_norm": 1.6215569305227406, "language_loss": 0.71172404, "learning_rate": 2.9775908217265633e-07, "loss": 0.73354948, "num_input_tokens_seen": 148989405, "step": 6897, "time_per_iteration": 3.5501115322113037 }, { "auxiliary_loss_clip": 0.01003458, "auxiliary_loss_mlp": 0.01001412, "balance_loss_clip": 1.0084033, "balance_loss_mlp": 1.0001725, "epoch": 0.8294354596284494, "flos": 63356156294400.0, "grad_norm": 0.8331935919363206, "language_loss": 0.50378245, "learning_rate": 2.9735027420771253e-07, "loss": 0.52383113, "num_input_tokens_seen": 149049740, "step": 6898, "time_per_iteration": 3.91984224319458 }, { "auxiliary_loss_clip": 0.01131157, "auxiliary_loss_mlp": 0.01024407, "balance_loss_clip": 1.04752326, "balance_loss_mlp": 1.01839304, "epoch": 0.8295557025190886, "flos": 24827021942400.0, "grad_norm": 1.9590605177554137, "language_loss": 0.71374846, "learning_rate": 2.969417245333774e-07, "loss": 0.73530412, "num_input_tokens_seen": 149069120, "step": 6899, "time_per_iteration": 2.6585938930511475 }, { "auxiliary_loss_clip": 0.01117782, "auxiliary_loss_mlp": 0.01024332, "balance_loss_clip": 1.04494917, "balance_loss_mlp": 1.01727462, "epoch": 0.8296759454097277, "flos": 25118580637440.0, "grad_norm": 3.4282192890561656, "language_loss": 0.78123349, "learning_rate": 2.9653343321162915e-07, "loss": 0.80265462, "num_input_tokens_seen": 149088630, "step": 6900, "time_per_iteration": 2.577820062637329 }, { "auxiliary_loss_clip": 0.01122583, "auxiliary_loss_mlp": 0.01024708, "balance_loss_clip": 1.04525363, "balance_loss_mlp": 1.01737666, "epoch": 0.8297961883003667, "flos": 24132581326080.0, "grad_norm": 1.9838705493857713, "language_loss": 0.64902973, "learning_rate": 2.9612540030440446e-07, "loss": 0.67050266, "num_input_tokens_seen": 149109175, "step": 6901, "time_per_iteration": 2.6214756965637207 }, { "auxiliary_loss_clip": 0.01033665, "auxiliary_loss_mlp": 0.01000572, "balance_loss_clip": 1.00673175, "balance_loss_mlp": 0.9994694, "epoch": 0.8299164311910058, "flos": 67446561375360.0, "grad_norm": 0.8464807689237711, "language_loss": 0.64179438, "learning_rate": 2.9571762587360206e-07, "loss": 0.66213673, "num_input_tokens_seen": 149165560, "step": 6902, "time_per_iteration": 3.0558977127075195 }, { "auxiliary_loss_clip": 0.01103883, "auxiliary_loss_mlp": 0.01030223, "balance_loss_clip": 1.03654015, "balance_loss_mlp": 1.0236094, "epoch": 0.8300366740816449, "flos": 25228682801280.0, "grad_norm": 2.0206873145923416, "language_loss": 0.73894012, "learning_rate": 2.953101099810806e-07, "loss": 0.76028121, "num_input_tokens_seen": 149185165, "step": 6903, "time_per_iteration": 2.6441383361816406 }, { "auxiliary_loss_clip": 0.01145763, "auxiliary_loss_mlp": 0.01021909, "balance_loss_clip": 1.04888701, "balance_loss_mlp": 1.01503992, "epoch": 0.830156916972284, "flos": 18041018757120.0, "grad_norm": 2.1478537478170656, "language_loss": 0.82131624, "learning_rate": 2.9490285268865965e-07, "loss": 0.8429929, "num_input_tokens_seen": 149202655, "step": 6904, "time_per_iteration": 2.492135763168335 }, { "auxiliary_loss_clip": 0.01154189, "auxiliary_loss_mlp": 0.0103251, "balance_loss_clip": 1.04733765, "balance_loss_mlp": 1.02475286, "epoch": 0.830277159862923, "flos": 26322485806080.0, "grad_norm": 2.3107081566827485, "language_loss": 0.79741418, "learning_rate": 2.9449585405812085e-07, "loss": 0.8192811, "num_input_tokens_seen": 149220035, "step": 6905, "time_per_iteration": 2.5346853733062744 }, { "auxiliary_loss_clip": 0.01121745, "auxiliary_loss_mlp": 0.01028252, "balance_loss_clip": 1.04369259, "balance_loss_mlp": 1.02146554, "epoch": 0.8303974027535622, "flos": 19938861751680.0, "grad_norm": 1.8782173375178988, "language_loss": 0.73798847, "learning_rate": 2.940891141512043e-07, "loss": 0.7594884, "num_input_tokens_seen": 149238055, "step": 6906, "time_per_iteration": 2.5450196266174316 }, { "auxiliary_loss_clip": 0.0113324, "auxiliary_loss_mlp": 0.01025455, "balance_loss_clip": 1.04243922, "balance_loss_mlp": 1.01838899, "epoch": 0.8305176456442013, "flos": 17165552572800.0, "grad_norm": 2.6230782085655955, "language_loss": 0.72024798, "learning_rate": 2.9368263302961385e-07, "loss": 0.74183488, "num_input_tokens_seen": 149256755, "step": 6907, "time_per_iteration": 2.498473882675171 }, { "auxiliary_loss_clip": 0.0109203, "auxiliary_loss_mlp": 0.01026938, "balance_loss_clip": 1.03817129, "balance_loss_mlp": 1.01986027, "epoch": 0.8306378885348403, "flos": 25627614226560.0, "grad_norm": 1.8993583861618317, "language_loss": 0.7983768, "learning_rate": 2.9327641075501075e-07, "loss": 0.81956649, "num_input_tokens_seen": 149275745, "step": 6908, "time_per_iteration": 2.6744396686553955 }, { "auxiliary_loss_clip": 0.01129353, "auxiliary_loss_mlp": 0.01032008, "balance_loss_clip": 1.04136133, "balance_loss_mlp": 1.0244292, "epoch": 0.8307581314254795, "flos": 33947864985600.0, "grad_norm": 2.5212758564022013, "language_loss": 0.66277796, "learning_rate": 2.9287044738901866e-07, "loss": 0.68439162, "num_input_tokens_seen": 149293730, "step": 6909, "time_per_iteration": 2.6235599517822266 }, { "auxiliary_loss_clip": 0.01150842, "auxiliary_loss_mlp": 0.00760629, "balance_loss_clip": 1.04539132, "balance_loss_mlp": 1.00028038, "epoch": 0.8308783743161186, "flos": 17562724231680.0, "grad_norm": 2.2314383744752413, "language_loss": 0.90753531, "learning_rate": 2.9246474299322274e-07, "loss": 0.92665005, "num_input_tokens_seen": 149309290, "step": 6910, "time_per_iteration": 2.4742605686187744 }, { "auxiliary_loss_clip": 0.01020243, "auxiliary_loss_mlp": 0.01000786, "balance_loss_clip": 1.00860071, "balance_loss_mlp": 0.99966592, "epoch": 0.8309986172067576, "flos": 69412885649280.0, "grad_norm": 0.8873314165248043, "language_loss": 0.63125604, "learning_rate": 2.920592976291678e-07, "loss": 0.65146631, "num_input_tokens_seen": 149366620, "step": 6911, "time_per_iteration": 3.0706613063812256 }, { "auxiliary_loss_clip": 0.01149451, "auxiliary_loss_mlp": 0.01029055, "balance_loss_clip": 1.04566824, "balance_loss_mlp": 1.02185798, "epoch": 0.8311188600973968, "flos": 22309755886080.0, "grad_norm": 2.0331773594740223, "language_loss": 0.80727381, "learning_rate": 2.916541113583595e-07, "loss": 0.82905883, "num_input_tokens_seen": 149385120, "step": 6912, "time_per_iteration": 3.323946952819824 }, { "auxiliary_loss_clip": 0.01122882, "auxiliary_loss_mlp": 0.01024199, "balance_loss_clip": 1.0453825, "balance_loss_mlp": 1.01727307, "epoch": 0.8312391029880358, "flos": 18770077105920.0, "grad_norm": 1.9454694964846262, "language_loss": 0.66138637, "learning_rate": 2.912491842422642e-07, "loss": 0.68285716, "num_input_tokens_seen": 149402825, "step": 6913, "time_per_iteration": 2.528140068054199 }, { "auxiliary_loss_clip": 0.01150433, "auxiliary_loss_mlp": 0.01025002, "balance_loss_clip": 1.04592276, "balance_loss_mlp": 1.01850557, "epoch": 0.8313593458786749, "flos": 20376648714240.0, "grad_norm": 1.678789161350107, "language_loss": 0.71107662, "learning_rate": 2.9084451634230857e-07, "loss": 0.732831, "num_input_tokens_seen": 149422125, "step": 6914, "time_per_iteration": 2.498622179031372 }, { "auxiliary_loss_clip": 0.01121876, "auxiliary_loss_mlp": 0.01027846, "balance_loss_clip": 1.04239082, "balance_loss_mlp": 1.02012086, "epoch": 0.831479588769314, "flos": 32124069878400.0, "grad_norm": 2.183609824292139, "language_loss": 0.7159611, "learning_rate": 2.9044010771988125e-07, "loss": 0.73745829, "num_input_tokens_seen": 149441940, "step": 6915, "time_per_iteration": 2.6435840129852295 }, { "auxiliary_loss_clip": 0.0112886, "auxiliary_loss_mlp": 0.01026248, "balance_loss_clip": 1.04348421, "balance_loss_mlp": 1.01959658, "epoch": 0.8315998316599531, "flos": 45185929338240.0, "grad_norm": 1.9655659806882637, "language_loss": 0.71944344, "learning_rate": 2.900359584363303e-07, "loss": 0.74099457, "num_input_tokens_seen": 149465045, "step": 6916, "time_per_iteration": 2.7420504093170166 }, { "auxiliary_loss_clip": 0.01106974, "auxiliary_loss_mlp": 0.01036115, "balance_loss_clip": 1.04603755, "balance_loss_mlp": 1.02818155, "epoch": 0.8317200745505922, "flos": 18363747479040.0, "grad_norm": 2.1619751453930878, "language_loss": 0.84580094, "learning_rate": 2.8963206855296494e-07, "loss": 0.86723185, "num_input_tokens_seen": 149481285, "step": 6917, "time_per_iteration": 2.5486273765563965 }, { "auxiliary_loss_clip": 0.01149279, "auxiliary_loss_mlp": 0.01025599, "balance_loss_clip": 1.04451215, "balance_loss_mlp": 1.01896191, "epoch": 0.8318403174412313, "flos": 24206557386240.0, "grad_norm": 2.097963353352098, "language_loss": 0.76986897, "learning_rate": 2.892284381310548e-07, "loss": 0.79161775, "num_input_tokens_seen": 149502700, "step": 6918, "time_per_iteration": 2.5238144397735596 }, { "auxiliary_loss_clip": 0.0113235, "auxiliary_loss_mlp": 0.01027978, "balance_loss_clip": 1.04495144, "balance_loss_mlp": 1.02018166, "epoch": 0.8319605603318704, "flos": 22418780641920.0, "grad_norm": 2.408191634394838, "language_loss": 0.72620606, "learning_rate": 2.888250672318302e-07, "loss": 0.74780935, "num_input_tokens_seen": 149520100, "step": 6919, "time_per_iteration": 2.520975112915039 }, { "auxiliary_loss_clip": 0.01166618, "auxiliary_loss_mlp": 0.0102371, "balance_loss_clip": 1.04920876, "balance_loss_mlp": 1.01633358, "epoch": 0.8320808032225094, "flos": 37414501459200.0, "grad_norm": 1.8791478253207499, "language_loss": 0.68471563, "learning_rate": 2.884219559164831e-07, "loss": 0.70661896, "num_input_tokens_seen": 149543245, "step": 6920, "time_per_iteration": 2.6202375888824463 }, { "auxiliary_loss_clip": 0.01149717, "auxiliary_loss_mlp": 0.01027005, "balance_loss_clip": 1.04734325, "balance_loss_mlp": 1.0198462, "epoch": 0.8322010461131486, "flos": 12787395638400.0, "grad_norm": 1.7738973017407877, "language_loss": 0.81305653, "learning_rate": 2.880191042461635e-07, "loss": 0.83482373, "num_input_tokens_seen": 149559185, "step": 6921, "time_per_iteration": 2.4635233879089355 }, { "auxiliary_loss_clip": 0.01114242, "auxiliary_loss_mlp": 0.01024882, "balance_loss_clip": 1.0418452, "balance_loss_mlp": 1.01843584, "epoch": 0.8323212890037877, "flos": 15815455050240.0, "grad_norm": 1.8041463115212717, "language_loss": 0.80084991, "learning_rate": 2.876165122819849e-07, "loss": 0.82224119, "num_input_tokens_seen": 149577165, "step": 6922, "time_per_iteration": 4.1805949211120605 }, { "auxiliary_loss_clip": 0.01161583, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 1.04801238, "balance_loss_mlp": 1.01795888, "epoch": 0.8324415318944267, "flos": 21719276208000.0, "grad_norm": 1.504507128361725, "language_loss": 0.791767, "learning_rate": 2.872141800850201e-07, "loss": 0.81363499, "num_input_tokens_seen": 149594340, "step": 6923, "time_per_iteration": 2.4639930725097656 }, { "auxiliary_loss_clip": 0.0116218, "auxiliary_loss_mlp": 0.01026709, "balance_loss_clip": 1.0470798, "balance_loss_mlp": 1.02017963, "epoch": 0.8325617747850659, "flos": 34198700636160.0, "grad_norm": 1.5905806187787586, "language_loss": 0.72978544, "learning_rate": 2.868121077163024e-07, "loss": 0.75167429, "num_input_tokens_seen": 149613895, "step": 6924, "time_per_iteration": 3.2682645320892334 }, { "auxiliary_loss_clip": 0.01150252, "auxiliary_loss_mlp": 0.01027764, "balance_loss_clip": 1.04374492, "balance_loss_mlp": 1.02046287, "epoch": 0.8326820176757049, "flos": 18369457741440.0, "grad_norm": 1.906763391193866, "language_loss": 0.71792132, "learning_rate": 2.864102952368257e-07, "loss": 0.73970151, "num_input_tokens_seen": 149631820, "step": 6925, "time_per_iteration": 2.4722516536712646 }, { "auxiliary_loss_clip": 0.01098295, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.03768682, "balance_loss_mlp": 1.02039528, "epoch": 0.832802260566344, "flos": 35991325716480.0, "grad_norm": 1.2155282795429059, "language_loss": 0.5930047, "learning_rate": 2.860087427075444e-07, "loss": 0.61426443, "num_input_tokens_seen": 149656070, "step": 6926, "time_per_iteration": 2.7123942375183105 }, { "auxiliary_loss_clip": 0.01132716, "auxiliary_loss_mlp": 0.01030122, "balance_loss_clip": 1.04466772, "balance_loss_mlp": 1.02339244, "epoch": 0.8329225034569832, "flos": 14244434928000.0, "grad_norm": 3.957038101064704, "language_loss": 0.86509615, "learning_rate": 2.856074501893744e-07, "loss": 0.88672453, "num_input_tokens_seen": 149671270, "step": 6927, "time_per_iteration": 2.4870588779449463 }, { "auxiliary_loss_clip": 0.0115197, "auxiliary_loss_mlp": 0.01025658, "balance_loss_clip": 1.04890585, "balance_loss_mlp": 1.01857352, "epoch": 0.8330427463476222, "flos": 18077468083200.0, "grad_norm": 1.7068402110295868, "language_loss": 0.81339824, "learning_rate": 2.8520641774319054e-07, "loss": 0.8351745, "num_input_tokens_seen": 149689360, "step": 6928, "time_per_iteration": 2.469676971435547 }, { "auxiliary_loss_clip": 0.0113349, "auxiliary_loss_mlp": 0.01024716, "balance_loss_clip": 1.03932095, "balance_loss_mlp": 1.01715195, "epoch": 0.8331629892382613, "flos": 18040839189120.0, "grad_norm": 2.1214569376378263, "language_loss": 0.76454782, "learning_rate": 2.848056454298309e-07, "loss": 0.78612989, "num_input_tokens_seen": 149706685, "step": 6929, "time_per_iteration": 2.490370035171509 }, { "auxiliary_loss_clip": 0.0113642, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.04678035, "balance_loss_mlp": 1.02801251, "epoch": 0.8332832321289004, "flos": 17457398576640.0, "grad_norm": 2.1437969134455073, "language_loss": 0.654728, "learning_rate": 2.844051333100905e-07, "loss": 0.6764406, "num_input_tokens_seen": 149724230, "step": 6930, "time_per_iteration": 2.507584571838379 }, { "auxiliary_loss_clip": 0.01137791, "auxiliary_loss_mlp": 0.01029014, "balance_loss_clip": 1.04812276, "balance_loss_mlp": 1.02288985, "epoch": 0.8334034750195395, "flos": 15084852416640.0, "grad_norm": 1.784572527459324, "language_loss": 0.83658791, "learning_rate": 2.840048814447269e-07, "loss": 0.85825598, "num_input_tokens_seen": 149742395, "step": 6931, "time_per_iteration": 2.5012803077697754 }, { "auxiliary_loss_clip": 0.01127364, "auxiliary_loss_mlp": 0.010278, "balance_loss_clip": 1.04226565, "balance_loss_mlp": 1.02040315, "epoch": 0.8335237179101785, "flos": 19427170556160.0, "grad_norm": 2.2468238284084556, "language_loss": 0.74125546, "learning_rate": 2.836048898944587e-07, "loss": 0.76280713, "num_input_tokens_seen": 149760820, "step": 6932, "time_per_iteration": 2.5041511058807373 }, { "auxiliary_loss_clip": 0.01135174, "auxiliary_loss_mlp": 0.01023014, "balance_loss_clip": 1.04357815, "balance_loss_mlp": 1.01635885, "epoch": 0.8336439608008177, "flos": 21762046327680.0, "grad_norm": 2.9272970814453, "language_loss": 0.72563815, "learning_rate": 2.832051587199642e-07, "loss": 0.74721992, "num_input_tokens_seen": 149778075, "step": 6933, "time_per_iteration": 2.53576922416687 }, { "auxiliary_loss_clip": 0.01046519, "auxiliary_loss_mlp": 0.01000502, "balance_loss_clip": 1.00814819, "balance_loss_mlp": 0.99932194, "epoch": 0.8337642036914568, "flos": 59702783990400.0, "grad_norm": 0.8606813897775577, "language_loss": 0.57772636, "learning_rate": 2.828056879818821e-07, "loss": 0.59819663, "num_input_tokens_seen": 149837150, "step": 6934, "time_per_iteration": 3.035710573196411 }, { "auxiliary_loss_clip": 0.01119193, "auxiliary_loss_mlp": 0.01028614, "balance_loss_clip": 1.0387013, "balance_loss_mlp": 1.0221616, "epoch": 0.8338844465820958, "flos": 27162185022720.0, "grad_norm": 1.9893297660813498, "language_loss": 0.83579826, "learning_rate": 2.824064777408117e-07, "loss": 0.85727632, "num_input_tokens_seen": 149856940, "step": 6935, "time_per_iteration": 2.5999209880828857 }, { "auxiliary_loss_clip": 0.01150146, "auxiliary_loss_mlp": 0.01029275, "balance_loss_clip": 1.04775047, "balance_loss_mlp": 1.02205062, "epoch": 0.8340046894727349, "flos": 30481264425600.0, "grad_norm": 1.8010123581805222, "language_loss": 0.75971317, "learning_rate": 2.8200752805731263e-07, "loss": 0.78150737, "num_input_tokens_seen": 149879930, "step": 6936, "time_per_iteration": 2.582643747329712 }, { "auxiliary_loss_clip": 0.01150903, "auxiliary_loss_mlp": 0.01034754, "balance_loss_clip": 1.04791331, "balance_loss_mlp": 1.02744341, "epoch": 0.834124932363374, "flos": 27126166659840.0, "grad_norm": 1.4657558330022697, "language_loss": 0.80977416, "learning_rate": 2.8160883899190625e-07, "loss": 0.83163083, "num_input_tokens_seen": 149903200, "step": 6937, "time_per_iteration": 3.3680598735809326 }, { "auxiliary_loss_clip": 0.01113268, "auxiliary_loss_mlp": 0.01028677, "balance_loss_clip": 1.04452181, "balance_loss_mlp": 1.02194142, "epoch": 0.8342451752540131, "flos": 24569865498240.0, "grad_norm": 1.9791017051973514, "language_loss": 0.73281473, "learning_rate": 2.8121041060507234e-07, "loss": 0.75423419, "num_input_tokens_seen": 149922230, "step": 6938, "time_per_iteration": 2.5991439819335938 }, { "auxiliary_loss_clip": 0.01152498, "auxiliary_loss_mlp": 0.01022477, "balance_loss_clip": 1.04510832, "balance_loss_mlp": 1.01560163, "epoch": 0.8343654181446521, "flos": 26615085995520.0, "grad_norm": 1.6533816019507215, "language_loss": 0.7149142, "learning_rate": 2.808122429572528e-07, "loss": 0.73666394, "num_input_tokens_seen": 149942435, "step": 6939, "time_per_iteration": 2.525315284729004 }, { "auxiliary_loss_clip": 0.01131314, "auxiliary_loss_mlp": 0.01024999, "balance_loss_clip": 1.04516673, "balance_loss_mlp": 1.01838017, "epoch": 0.8344856610352913, "flos": 20777268078720.0, "grad_norm": 3.002769587587452, "language_loss": 0.75834548, "learning_rate": 2.804143361088489e-07, "loss": 0.77990866, "num_input_tokens_seen": 149961615, "step": 6940, "time_per_iteration": 2.5601181983947754 }, { "auxiliary_loss_clip": 0.0113066, "auxiliary_loss_mlp": 0.01023569, "balance_loss_clip": 1.04403853, "balance_loss_mlp": 1.01682746, "epoch": 0.8346059039259304, "flos": 26095960684800.0, "grad_norm": 2.1184543043852875, "language_loss": 0.77812815, "learning_rate": 2.8001669012022277e-07, "loss": 0.7996704, "num_input_tokens_seen": 149979585, "step": 6941, "time_per_iteration": 2.5500922203063965 }, { "auxiliary_loss_clip": 0.0115069, "auxiliary_loss_mlp": 0.01027115, "balance_loss_clip": 1.04904294, "balance_loss_mlp": 1.02050233, "epoch": 0.8347261468165694, "flos": 29027708755200.0, "grad_norm": 2.88573513958506, "language_loss": 0.69206262, "learning_rate": 2.7961930505169795e-07, "loss": 0.71384066, "num_input_tokens_seen": 150003830, "step": 6942, "time_per_iteration": 2.574700355529785 }, { "auxiliary_loss_clip": 0.01152558, "auxiliary_loss_mlp": 0.00760741, "balance_loss_clip": 1.04690421, "balance_loss_mlp": 1.00028253, "epoch": 0.8348463897072086, "flos": 26396461866240.0, "grad_norm": 1.813701949470426, "language_loss": 0.76458108, "learning_rate": 2.792221809635558e-07, "loss": 0.78371412, "num_input_tokens_seen": 150024460, "step": 6943, "time_per_iteration": 2.531498908996582 }, { "auxiliary_loss_clip": 0.01081154, "auxiliary_loss_mlp": 0.01024117, "balance_loss_clip": 1.04009283, "balance_loss_mlp": 1.01641011, "epoch": 0.8349666325978476, "flos": 23367720096000.0, "grad_norm": 2.0243868426321003, "language_loss": 0.75070429, "learning_rate": 2.788253179160411e-07, "loss": 0.77175701, "num_input_tokens_seen": 150045620, "step": 6944, "time_per_iteration": 2.6595914363861084 }, { "auxiliary_loss_clip": 0.011373, "auxiliary_loss_mlp": 0.01028937, "balance_loss_clip": 1.04593992, "balance_loss_mlp": 1.02230597, "epoch": 0.8350868754884867, "flos": 12896528135040.0, "grad_norm": 1.9969684922585191, "language_loss": 0.6487689, "learning_rate": 2.7842871596935725e-07, "loss": 0.6704312, "num_input_tokens_seen": 150064135, "step": 6945, "time_per_iteration": 2.4938418865203857 }, { "auxiliary_loss_clip": 0.01150638, "auxiliary_loss_mlp": 0.01027064, "balance_loss_clip": 1.0451777, "balance_loss_mlp": 1.01998329, "epoch": 0.8352071183791259, "flos": 26505522535680.0, "grad_norm": 1.5891884532658413, "language_loss": 0.69336224, "learning_rate": 2.780323751836682e-07, "loss": 0.71513927, "num_input_tokens_seen": 150085350, "step": 6946, "time_per_iteration": 2.5338375568389893 }, { "auxiliary_loss_clip": 0.01134334, "auxiliary_loss_mlp": 0.00760414, "balance_loss_clip": 1.04177403, "balance_loss_mlp": 1.00030375, "epoch": 0.8353273612697649, "flos": 20668063754880.0, "grad_norm": 1.418877767900213, "language_loss": 0.78774226, "learning_rate": 2.7763629561909876e-07, "loss": 0.80668974, "num_input_tokens_seen": 150106180, "step": 6947, "time_per_iteration": 2.5455806255340576 }, { "auxiliary_loss_clip": 0.01161892, "auxiliary_loss_mlp": 0.0102449, "balance_loss_clip": 1.04652619, "balance_loss_mlp": 1.01733696, "epoch": 0.835447604160404, "flos": 19754137082880.0, "grad_norm": 1.9011594481347391, "language_loss": 0.77043235, "learning_rate": 2.772404773357335e-07, "loss": 0.79229623, "num_input_tokens_seen": 150125585, "step": 6948, "time_per_iteration": 4.042009353637695 }, { "auxiliary_loss_clip": 0.01113059, "auxiliary_loss_mlp": 0.01023524, "balance_loss_clip": 1.04204798, "balance_loss_mlp": 1.0166868, "epoch": 0.8355678470510431, "flos": 23435842239360.0, "grad_norm": 2.310157871185444, "language_loss": 0.78437167, "learning_rate": 2.7684492039361853e-07, "loss": 0.8057375, "num_input_tokens_seen": 150144810, "step": 6949, "time_per_iteration": 2.573755979537964 }, { "auxiliary_loss_clip": 0.01166374, "auxiliary_loss_mlp": 0.01026772, "balance_loss_clip": 1.05042684, "balance_loss_mlp": 1.02005124, "epoch": 0.8356880899416822, "flos": 21214588164480.0, "grad_norm": 1.7395999244525835, "language_loss": 0.83807564, "learning_rate": 2.764496248527586e-07, "loss": 0.86000705, "num_input_tokens_seen": 150163785, "step": 6950, "time_per_iteration": 3.2029471397399902 }, { "auxiliary_loss_clip": 0.01125033, "auxiliary_loss_mlp": 0.01024412, "balance_loss_clip": 1.04242051, "balance_loss_mlp": 1.01739645, "epoch": 0.8358083328323213, "flos": 28037543466240.0, "grad_norm": 1.7428694759786143, "language_loss": 0.7845186, "learning_rate": 2.760545907731211e-07, "loss": 0.80601299, "num_input_tokens_seen": 150184360, "step": 6951, "time_per_iteration": 2.5964114665985107 }, { "auxiliary_loss_clip": 0.01149679, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.04467058, "balance_loss_mlp": 1.02465999, "epoch": 0.8359285757229604, "flos": 27783655159680.0, "grad_norm": 1.8009366962808282, "language_loss": 0.67856836, "learning_rate": 2.75659818214631e-07, "loss": 0.70038342, "num_input_tokens_seen": 150205465, "step": 6952, "time_per_iteration": 2.5559446811676025 }, { "auxiliary_loss_clip": 0.01138956, "auxiliary_loss_mlp": 0.01025753, "balance_loss_clip": 1.04455447, "balance_loss_mlp": 1.019238, "epoch": 0.8360488186135995, "flos": 21435115714560.0, "grad_norm": 2.6500068444593965, "language_loss": 0.77958357, "learning_rate": 2.752653072371749e-07, "loss": 0.80123061, "num_input_tokens_seen": 150224900, "step": 6953, "time_per_iteration": 2.530442476272583 }, { "auxiliary_loss_clip": 0.0112091, "auxiliary_loss_mlp": 0.01028468, "balance_loss_clip": 1.04623795, "balance_loss_mlp": 1.02167916, "epoch": 0.8361690615042385, "flos": 27632327160960.0, "grad_norm": 1.788053440864496, "language_loss": 0.74744362, "learning_rate": 2.7487105790060105e-07, "loss": 0.76893741, "num_input_tokens_seen": 150244310, "step": 6954, "time_per_iteration": 2.600433588027954 }, { "auxiliary_loss_clip": 0.0115145, "auxiliary_loss_mlp": 0.01021378, "balance_loss_clip": 1.0458554, "balance_loss_mlp": 1.01520872, "epoch": 0.8362893043948777, "flos": 39202529598720.0, "grad_norm": 1.818640304444648, "language_loss": 0.69312578, "learning_rate": 2.7447707026471587e-07, "loss": 0.714854, "num_input_tokens_seen": 150267285, "step": 6955, "time_per_iteration": 2.6609978675842285 }, { "auxiliary_loss_clip": 0.01123124, "auxiliary_loss_mlp": 0.01023147, "balance_loss_clip": 1.04177511, "balance_loss_mlp": 1.01642418, "epoch": 0.8364095472855168, "flos": 24785329230720.0, "grad_norm": 1.8959171807103095, "language_loss": 0.79701591, "learning_rate": 2.740833443892874e-07, "loss": 0.81847858, "num_input_tokens_seen": 150285455, "step": 6956, "time_per_iteration": 2.5894551277160645 }, { "auxiliary_loss_clip": 0.01136616, "auxiliary_loss_mlp": 0.01028376, "balance_loss_clip": 1.044541, "balance_loss_mlp": 1.02161694, "epoch": 0.8365297901761558, "flos": 22743412784640.0, "grad_norm": 2.1104840003381518, "language_loss": 0.79624593, "learning_rate": 2.7368988033404327e-07, "loss": 0.81789583, "num_input_tokens_seen": 150302970, "step": 6957, "time_per_iteration": 2.541539430618286 }, { "auxiliary_loss_clip": 0.0112583, "auxiliary_loss_mlp": 0.01028207, "balance_loss_clip": 1.04454756, "balance_loss_mlp": 1.02158189, "epoch": 0.836650033066795, "flos": 28396003242240.0, "grad_norm": 1.6385348365862804, "language_loss": 0.84315002, "learning_rate": 2.732966781586712e-07, "loss": 0.8646903, "num_input_tokens_seen": 150322715, "step": 6958, "time_per_iteration": 2.6140875816345215 }, { "auxiliary_loss_clip": 0.01145139, "auxiliary_loss_mlp": 0.01027795, "balance_loss_clip": 1.04406095, "balance_loss_mlp": 1.02091074, "epoch": 0.836770275957434, "flos": 22236857233920.0, "grad_norm": 1.7950866027382386, "language_loss": 0.66726971, "learning_rate": 2.729037379228205e-07, "loss": 0.688999, "num_input_tokens_seen": 150342900, "step": 6959, "time_per_iteration": 2.5265390872955322 }, { "auxiliary_loss_clip": 0.01138429, "auxiliary_loss_mlp": 0.01027031, "balance_loss_clip": 1.04806042, "balance_loss_mlp": 1.02020037, "epoch": 0.8368905188480731, "flos": 22491930689280.0, "grad_norm": 1.4139741463418498, "language_loss": 0.80470526, "learning_rate": 2.725110596860998e-07, "loss": 0.82635981, "num_input_tokens_seen": 150363580, "step": 6960, "time_per_iteration": 2.570114850997925 }, { "auxiliary_loss_clip": 0.01106071, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.04348159, "balance_loss_mlp": 1.0191133, "epoch": 0.8370107617387123, "flos": 13370405287680.0, "grad_norm": 1.972801501213052, "language_loss": 0.69863671, "learning_rate": 2.7211864350807776e-07, "loss": 0.71995533, "num_input_tokens_seen": 150381780, "step": 6961, "time_per_iteration": 2.6092357635498047 }, { "auxiliary_loss_clip": 0.01163251, "auxiliary_loss_mlp": 0.01025626, "balance_loss_clip": 1.04637194, "balance_loss_mlp": 1.01835775, "epoch": 0.8371310046293513, "flos": 25261289372160.0, "grad_norm": 1.5544537257518154, "language_loss": 0.73609275, "learning_rate": 2.717264894482836e-07, "loss": 0.75798148, "num_input_tokens_seen": 150402120, "step": 6962, "time_per_iteration": 2.4989330768585205 }, { "auxiliary_loss_clip": 0.01151094, "auxiliary_loss_mlp": 0.01024668, "balance_loss_clip": 1.04618549, "balance_loss_mlp": 1.01760197, "epoch": 0.8372512475199904, "flos": 19792705311360.0, "grad_norm": 1.918830587723856, "language_loss": 0.81022978, "learning_rate": 2.7133459756620646e-07, "loss": 0.83198744, "num_input_tokens_seen": 150419315, "step": 6963, "time_per_iteration": 3.310995578765869 }, { "auxiliary_loss_clip": 0.01143135, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 1.04398012, "balance_loss_mlp": 1.02059793, "epoch": 0.8373714904106295, "flos": 19391224020480.0, "grad_norm": 1.6041692642114038, "language_loss": 0.73805106, "learning_rate": 2.7094296792129733e-07, "loss": 0.75976199, "num_input_tokens_seen": 150438915, "step": 6964, "time_per_iteration": 2.486720323562622 }, { "auxiliary_loss_clip": 0.01151637, "auxiliary_loss_mlp": 0.01028177, "balance_loss_clip": 1.04652119, "balance_loss_mlp": 1.02155757, "epoch": 0.8374917333012686, "flos": 14975935401600.0, "grad_norm": 1.9026981064982034, "language_loss": 0.75457311, "learning_rate": 2.7055160057296424e-07, "loss": 0.77637124, "num_input_tokens_seen": 150456155, "step": 6965, "time_per_iteration": 2.4502720832824707 }, { "auxiliary_loss_clip": 0.01123433, "auxiliary_loss_mlp": 0.01028203, "balance_loss_clip": 1.04403269, "balance_loss_mlp": 1.02098513, "epoch": 0.8376119761919076, "flos": 30331839847680.0, "grad_norm": 1.5840589112968788, "language_loss": 0.72228789, "learning_rate": 2.7016049558057896e-07, "loss": 0.74380422, "num_input_tokens_seen": 150478115, "step": 6966, "time_per_iteration": 2.6322102546691895 }, { "auxiliary_loss_clip": 0.01150453, "auxiliary_loss_mlp": 0.01021675, "balance_loss_clip": 1.04693413, "balance_loss_mlp": 1.01480246, "epoch": 0.8377322190825467, "flos": 29423336129280.0, "grad_norm": 1.7121631329994285, "language_loss": 0.70734161, "learning_rate": 2.6976965300347074e-07, "loss": 0.72906291, "num_input_tokens_seen": 150500725, "step": 6967, "time_per_iteration": 2.5537824630737305 }, { "auxiliary_loss_clip": 0.01130853, "auxiliary_loss_mlp": 0.01027544, "balance_loss_clip": 1.04211807, "balance_loss_mlp": 1.02081454, "epoch": 0.8378524619731859, "flos": 26687086807680.0, "grad_norm": 2.623288530221996, "language_loss": 0.69551104, "learning_rate": 2.693790729009309e-07, "loss": 0.71709502, "num_input_tokens_seen": 150522335, "step": 6968, "time_per_iteration": 2.5764079093933105 }, { "auxiliary_loss_clip": 0.01136774, "auxiliary_loss_mlp": 0.01024481, "balance_loss_clip": 1.04501462, "balance_loss_mlp": 1.01781166, "epoch": 0.8379727048638249, "flos": 20703866636160.0, "grad_norm": 1.8445608458355607, "language_loss": 0.88643444, "learning_rate": 2.6898875533220946e-07, "loss": 0.90804702, "num_input_tokens_seen": 150541640, "step": 6969, "time_per_iteration": 2.537450075149536 }, { "auxiliary_loss_clip": 0.0115787, "auxiliary_loss_mlp": 0.01023598, "balance_loss_clip": 1.04652369, "balance_loss_mlp": 1.01798296, "epoch": 0.838092947754464, "flos": 20084084438400.0, "grad_norm": 1.6668767143367138, "language_loss": 0.81733429, "learning_rate": 2.685987003565171e-07, "loss": 0.839149, "num_input_tokens_seen": 150559680, "step": 6970, "time_per_iteration": 2.467628240585327 }, { "auxiliary_loss_clip": 0.0111473, "auxiliary_loss_mlp": 0.01025713, "balance_loss_clip": 1.04528964, "balance_loss_mlp": 1.01885259, "epoch": 0.8382131906451031, "flos": 18113270964480.0, "grad_norm": 2.2374621660147356, "language_loss": 0.74824572, "learning_rate": 2.6820890803302566e-07, "loss": 0.76965016, "num_input_tokens_seen": 150575205, "step": 6971, "time_per_iteration": 2.5271248817443848 }, { "auxiliary_loss_clip": 0.01131001, "auxiliary_loss_mlp": 0.01028671, "balance_loss_clip": 1.04591012, "balance_loss_mlp": 1.02188444, "epoch": 0.8383334335357422, "flos": 17092653920640.0, "grad_norm": 2.1670283431241186, "language_loss": 0.81884503, "learning_rate": 2.6781937842086557e-07, "loss": 0.84044176, "num_input_tokens_seen": 150593995, "step": 6972, "time_per_iteration": 2.507235288619995 }, { "auxiliary_loss_clip": 0.01151762, "auxiliary_loss_mlp": 0.01025713, "balance_loss_clip": 1.04600441, "balance_loss_mlp": 1.01951706, "epoch": 0.8384536764263812, "flos": 20704728562560.0, "grad_norm": 1.7386135731704802, "language_loss": 0.670784, "learning_rate": 2.6743011157912933e-07, "loss": 0.69255871, "num_input_tokens_seen": 150613715, "step": 6973, "time_per_iteration": 2.4929585456848145 }, { "auxiliary_loss_clip": 0.01105293, "auxiliary_loss_mlp": 0.01029168, "balance_loss_clip": 1.03767157, "balance_loss_mlp": 1.02221262, "epoch": 0.8385739193170204, "flos": 28986842056320.0, "grad_norm": 1.625313833064709, "language_loss": 0.65178227, "learning_rate": 2.6704110756686725e-07, "loss": 0.67312694, "num_input_tokens_seen": 150634540, "step": 6974, "time_per_iteration": 4.276062726974487 }, { "auxiliary_loss_clip": 0.01131385, "auxiliary_loss_mlp": 0.00760849, "balance_loss_clip": 1.04269838, "balance_loss_mlp": 1.00028884, "epoch": 0.8386941622076595, "flos": 23438068882560.0, "grad_norm": 2.0240438074934057, "language_loss": 0.83553338, "learning_rate": 2.6665236644309085e-07, "loss": 0.85445571, "num_input_tokens_seen": 150654850, "step": 6975, "time_per_iteration": 2.564086675643921 }, { "auxiliary_loss_clip": 0.01151874, "auxiliary_loss_mlp": 0.01028927, "balance_loss_clip": 1.0463798, "balance_loss_mlp": 1.02205777, "epoch": 0.8388144050982985, "flos": 23002724044800.0, "grad_norm": 3.0469221625996585, "language_loss": 0.79535228, "learning_rate": 2.662638882667727e-07, "loss": 0.81716025, "num_input_tokens_seen": 150673790, "step": 6976, "time_per_iteration": 3.259716272354126 }, { "auxiliary_loss_clip": 0.01166989, "auxiliary_loss_mlp": 0.01028922, "balance_loss_clip": 1.04767907, "balance_loss_mlp": 1.02187395, "epoch": 0.8389346479889377, "flos": 24280353878400.0, "grad_norm": 1.986111163350732, "language_loss": 0.72994471, "learning_rate": 2.658756730968443e-07, "loss": 0.75190377, "num_input_tokens_seen": 150692255, "step": 6977, "time_per_iteration": 2.4829554557800293 }, { "auxiliary_loss_clip": 0.01139097, "auxiliary_loss_mlp": 0.01027462, "balance_loss_clip": 1.0458889, "balance_loss_mlp": 1.02038097, "epoch": 0.8390548908795767, "flos": 21215019127680.0, "grad_norm": 3.259294049439094, "language_loss": 0.88360351, "learning_rate": 2.654877209921975e-07, "loss": 0.90526903, "num_input_tokens_seen": 150709790, "step": 6978, "time_per_iteration": 2.5188701152801514 }, { "auxiliary_loss_clip": 0.01114379, "auxiliary_loss_mlp": 0.01025455, "balance_loss_clip": 1.04080653, "balance_loss_mlp": 1.01832891, "epoch": 0.8391751337702158, "flos": 35627299332480.0, "grad_norm": 2.3642599401974844, "language_loss": 0.62424564, "learning_rate": 2.651000320116843e-07, "loss": 0.64564395, "num_input_tokens_seen": 150730675, "step": 6979, "time_per_iteration": 2.707624673843384 }, { "auxiliary_loss_clip": 0.01119548, "auxiliary_loss_mlp": 0.00761361, "balance_loss_clip": 1.04291844, "balance_loss_mlp": 1.00027096, "epoch": 0.839295376660855, "flos": 21325229032320.0, "grad_norm": 2.135165426897522, "language_loss": 0.75812829, "learning_rate": 2.647126062141163e-07, "loss": 0.77693737, "num_input_tokens_seen": 150749750, "step": 6980, "time_per_iteration": 2.5651581287384033 }, { "auxiliary_loss_clip": 0.01136044, "auxiliary_loss_mlp": 0.01020847, "balance_loss_clip": 1.03957915, "balance_loss_mlp": 1.01393247, "epoch": 0.839415619551494, "flos": 18442535961600.0, "grad_norm": 2.039692373166702, "language_loss": 0.83960247, "learning_rate": 2.643254436582669e-07, "loss": 0.86117136, "num_input_tokens_seen": 150769240, "step": 6981, "time_per_iteration": 2.527170419692993 }, { "auxiliary_loss_clip": 0.01110751, "auxiliary_loss_mlp": 0.01028875, "balance_loss_clip": 1.04404676, "balance_loss_mlp": 1.02208889, "epoch": 0.8395358624421331, "flos": 23221958705280.0, "grad_norm": 1.7970901710996285, "language_loss": 0.822752, "learning_rate": 2.6393854440286743e-07, "loss": 0.84414822, "num_input_tokens_seen": 150788410, "step": 6982, "time_per_iteration": 2.6289656162261963 }, { "auxiliary_loss_clip": 0.01165747, "auxiliary_loss_mlp": 0.01026242, "balance_loss_clip": 1.05024433, "balance_loss_mlp": 1.01934004, "epoch": 0.8396561053327722, "flos": 24381657210240.0, "grad_norm": 1.9319821018857597, "language_loss": 0.70904201, "learning_rate": 2.6355190850661045e-07, "loss": 0.73096192, "num_input_tokens_seen": 150805245, "step": 6983, "time_per_iteration": 2.476008176803589 }, { "auxiliary_loss_clip": 0.01136832, "auxiliary_loss_mlp": 0.01027976, "balance_loss_clip": 1.04785824, "balance_loss_mlp": 1.02049005, "epoch": 0.8397763482234113, "flos": 22237755073920.0, "grad_norm": 1.5001502678213448, "language_loss": 0.86568415, "learning_rate": 2.631655360281486e-07, "loss": 0.88733226, "num_input_tokens_seen": 150824920, "step": 6984, "time_per_iteration": 2.5529427528381348 }, { "auxiliary_loss_clip": 0.01150487, "auxiliary_loss_mlp": 0.00760554, "balance_loss_clip": 1.04569066, "balance_loss_mlp": 1.00028276, "epoch": 0.8398965911140504, "flos": 22163743100160.0, "grad_norm": 1.8001417429298758, "language_loss": 0.65791178, "learning_rate": 2.6277942702609323e-07, "loss": 0.67702222, "num_input_tokens_seen": 150844400, "step": 6985, "time_per_iteration": 2.507512092590332 }, { "auxiliary_loss_clip": 0.01122638, "auxiliary_loss_mlp": 0.01026048, "balance_loss_clip": 1.04323506, "balance_loss_mlp": 1.01913953, "epoch": 0.8400168340046895, "flos": 21542775753600.0, "grad_norm": 2.2901404884610743, "language_loss": 0.87467116, "learning_rate": 2.623935815590186e-07, "loss": 0.89615798, "num_input_tokens_seen": 150862780, "step": 6986, "time_per_iteration": 2.560955047607422 }, { "auxiliary_loss_clip": 0.01137066, "auxiliary_loss_mlp": 0.01027455, "balance_loss_clip": 1.04619503, "balance_loss_mlp": 1.02033782, "epoch": 0.8401370768953286, "flos": 22491966602880.0, "grad_norm": 1.6815694427809096, "language_loss": 0.80693913, "learning_rate": 2.6200799968545516e-07, "loss": 0.82858431, "num_input_tokens_seen": 150883075, "step": 6987, "time_per_iteration": 2.537477970123291 }, { "auxiliary_loss_clip": 0.01031338, "auxiliary_loss_mlp": 0.01002311, "balance_loss_clip": 1.00913513, "balance_loss_mlp": 1.00133312, "epoch": 0.8402573197859676, "flos": 59238890818560.0, "grad_norm": 0.7852626339366039, "language_loss": 0.56480241, "learning_rate": 2.616226814638969e-07, "loss": 0.58513892, "num_input_tokens_seen": 150948180, "step": 6988, "time_per_iteration": 3.164156675338745 }, { "auxiliary_loss_clip": 0.0113507, "auxiliary_loss_mlp": 0.01024887, "balance_loss_clip": 1.04570425, "balance_loss_mlp": 1.0181576, "epoch": 0.8403775626766068, "flos": 22674608282880.0, "grad_norm": 1.8976502562124775, "language_loss": 0.77533293, "learning_rate": 2.612376269527954e-07, "loss": 0.79693246, "num_input_tokens_seen": 150967885, "step": 6989, "time_per_iteration": 2.537059783935547 }, { "auxiliary_loss_clip": 0.01135679, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 1.04640055, "balance_loss_mlp": 1.0184319, "epoch": 0.8404978055672458, "flos": 19609704495360.0, "grad_norm": 1.812978426605329, "language_loss": 0.67621547, "learning_rate": 2.608528362105635e-07, "loss": 0.69783068, "num_input_tokens_seen": 150987255, "step": 6990, "time_per_iteration": 3.2498533725738525 }, { "auxiliary_loss_clip": 0.0112281, "auxiliary_loss_mlp": 0.01022424, "balance_loss_clip": 1.04112363, "balance_loss_mlp": 1.01577485, "epoch": 0.8406180484578849, "flos": 27526929678720.0, "grad_norm": 1.7529014520591946, "language_loss": 0.72991693, "learning_rate": 2.6046830929557374e-07, "loss": 0.75136924, "num_input_tokens_seen": 151006905, "step": 6991, "time_per_iteration": 2.6162219047546387 }, { "auxiliary_loss_clip": 0.01117562, "auxiliary_loss_mlp": 0.01032111, "balance_loss_clip": 1.04320049, "balance_loss_mlp": 1.02544379, "epoch": 0.8407382913485241, "flos": 22127473342080.0, "grad_norm": 1.853753502122538, "language_loss": 0.84970617, "learning_rate": 2.6008404626615776e-07, "loss": 0.87120283, "num_input_tokens_seen": 151025405, "step": 6992, "time_per_iteration": 2.6020123958587646 }, { "auxiliary_loss_clip": 0.01152202, "auxiliary_loss_mlp": 0.01029416, "balance_loss_clip": 1.0466435, "balance_loss_mlp": 1.02219808, "epoch": 0.8408585342391631, "flos": 13918473982080.0, "grad_norm": 17.47952559738663, "language_loss": 0.74090397, "learning_rate": 2.597000471806092e-07, "loss": 0.76272011, "num_input_tokens_seen": 151041970, "step": 6993, "time_per_iteration": 2.483903169631958 }, { "auxiliary_loss_clip": 0.01132104, "auxiliary_loss_mlp": 0.01029893, "balance_loss_clip": 1.04742682, "balance_loss_mlp": 1.02279747, "epoch": 0.8409787771298022, "flos": 20187865808640.0, "grad_norm": 1.8195560310574774, "language_loss": 0.73008859, "learning_rate": 2.593163120971793e-07, "loss": 0.75170851, "num_input_tokens_seen": 151060835, "step": 6994, "time_per_iteration": 2.5459322929382324 }, { "auxiliary_loss_clip": 0.01099597, "auxiliary_loss_mlp": 0.01024029, "balance_loss_clip": 1.03830075, "balance_loss_mlp": 1.01717114, "epoch": 0.8410990200204413, "flos": 23142523777920.0, "grad_norm": 1.825510693413091, "language_loss": 0.69104493, "learning_rate": 2.5893284107408165e-07, "loss": 0.71228111, "num_input_tokens_seen": 151078205, "step": 6995, "time_per_iteration": 2.6083624362945557 }, { "auxiliary_loss_clip": 0.0110601, "auxiliary_loss_mlp": 0.01025883, "balance_loss_clip": 1.04232466, "balance_loss_mlp": 1.01879609, "epoch": 0.8412192629110804, "flos": 24027219757440.0, "grad_norm": 1.7384383751861623, "language_loss": 0.77665859, "learning_rate": 2.5854963416948726e-07, "loss": 0.79797745, "num_input_tokens_seen": 151100470, "step": 6996, "time_per_iteration": 2.657032012939453 }, { "auxiliary_loss_clip": 0.01103076, "auxiliary_loss_mlp": 0.01029306, "balance_loss_clip": 1.03765547, "balance_loss_mlp": 1.0225563, "epoch": 0.8413395058017195, "flos": 25591703604480.0, "grad_norm": 1.5539344652599512, "language_loss": 0.69218338, "learning_rate": 2.5816669144152816e-07, "loss": 0.71350718, "num_input_tokens_seen": 151121650, "step": 6997, "time_per_iteration": 2.64845871925354 }, { "auxiliary_loss_clip": 0.01054625, "auxiliary_loss_mlp": 0.01002603, "balance_loss_clip": 1.00730872, "balance_loss_mlp": 1.00153065, "epoch": 0.8414597486923585, "flos": 63635396624640.0, "grad_norm": 0.8510225124165085, "language_loss": 0.66283178, "learning_rate": 2.5778401294829777e-07, "loss": 0.68340409, "num_input_tokens_seen": 151180390, "step": 6998, "time_per_iteration": 3.1462180614471436 }, { "auxiliary_loss_clip": 0.01146527, "auxiliary_loss_mlp": 0.00760768, "balance_loss_clip": 1.04504287, "balance_loss_mlp": 1.00027204, "epoch": 0.8415799915829977, "flos": 19098731571840.0, "grad_norm": 1.6032223285574714, "language_loss": 0.64799929, "learning_rate": 2.574015987478473e-07, "loss": 0.66707218, "num_input_tokens_seen": 151198520, "step": 6999, "time_per_iteration": 2.479705333709717 }, { "auxiliary_loss_clip": 0.01140776, "auxiliary_loss_mlp": 0.01029307, "balance_loss_clip": 1.04542947, "balance_loss_mlp": 1.02169847, "epoch": 0.8417002344736367, "flos": 19821612781440.0, "grad_norm": 1.979499922254669, "language_loss": 0.867064, "learning_rate": 2.570194488981887e-07, "loss": 0.8887648, "num_input_tokens_seen": 151215065, "step": 7000, "time_per_iteration": 4.164418697357178 }, { "auxiliary_loss_clip": 0.01055353, "auxiliary_loss_mlp": 0.01002059, "balance_loss_clip": 1.00797796, "balance_loss_mlp": 1.00099802, "epoch": 0.8418204773642758, "flos": 62161516834560.0, "grad_norm": 0.8381690011365832, "language_loss": 0.60351789, "learning_rate": 2.566375634572939e-07, "loss": 0.62409204, "num_input_tokens_seen": 151275705, "step": 7001, "time_per_iteration": 3.010477066040039 }, { "auxiliary_loss_clip": 0.01124733, "auxiliary_loss_mlp": 0.01026018, "balance_loss_clip": 1.04122198, "balance_loss_mlp": 1.01847243, "epoch": 0.841940720254915, "flos": 17092905315840.0, "grad_norm": 1.737000420887874, "language_loss": 0.75868994, "learning_rate": 2.562559424830943e-07, "loss": 0.78019744, "num_input_tokens_seen": 151293665, "step": 7002, "time_per_iteration": 3.2193753719329834 }, { "auxiliary_loss_clip": 0.01130812, "auxiliary_loss_mlp": 0.01027336, "balance_loss_clip": 1.04252434, "balance_loss_mlp": 1.02025175, "epoch": 0.842060963145554, "flos": 16283586026880.0, "grad_norm": 3.9456816207422594, "language_loss": 0.70684135, "learning_rate": 2.5587458603348256e-07, "loss": 0.72842282, "num_input_tokens_seen": 151310955, "step": 7003, "time_per_iteration": 2.498051881790161 }, { "auxiliary_loss_clip": 0.0111562, "auxiliary_loss_mlp": 0.01028601, "balance_loss_clip": 1.04244077, "balance_loss_mlp": 1.02151728, "epoch": 0.8421812060361931, "flos": 21908238681600.0, "grad_norm": 1.6954764554503827, "language_loss": 0.83847845, "learning_rate": 2.554934941663085e-07, "loss": 0.85992062, "num_input_tokens_seen": 151328490, "step": 7004, "time_per_iteration": 2.5597798824310303 }, { "auxiliary_loss_clip": 0.01119595, "auxiliary_loss_mlp": 0.0102821, "balance_loss_clip": 1.04224598, "balance_loss_mlp": 1.02104819, "epoch": 0.8423014489268322, "flos": 27777693502080.0, "grad_norm": 2.554371009659951, "language_loss": 0.73230577, "learning_rate": 2.5511266693938484e-07, "loss": 0.75378382, "num_input_tokens_seen": 151346950, "step": 7005, "time_per_iteration": 2.5962836742401123 }, { "auxiliary_loss_clip": 0.01133269, "auxiliary_loss_mlp": 0.01031067, "balance_loss_clip": 1.04686713, "balance_loss_mlp": 1.02380085, "epoch": 0.8424216918174713, "flos": 25117610970240.0, "grad_norm": 4.548872087613753, "language_loss": 0.78021085, "learning_rate": 2.547321044104822e-07, "loss": 0.80185419, "num_input_tokens_seen": 151368445, "step": 7006, "time_per_iteration": 2.5657424926757812 }, { "auxiliary_loss_clip": 0.01166368, "auxiliary_loss_mlp": 0.01025259, "balance_loss_clip": 1.04693484, "balance_loss_mlp": 1.01806521, "epoch": 0.8425419347081103, "flos": 24748448941440.0, "grad_norm": 1.9297870857787616, "language_loss": 0.76751721, "learning_rate": 2.5435180663733113e-07, "loss": 0.78943348, "num_input_tokens_seen": 151388745, "step": 7007, "time_per_iteration": 2.4974164962768555 }, { "auxiliary_loss_clip": 0.01114552, "auxiliary_loss_mlp": 0.01026378, "balance_loss_clip": 1.04221153, "balance_loss_mlp": 1.01952338, "epoch": 0.8426621775987495, "flos": 24820916630400.0, "grad_norm": 2.239193693315536, "language_loss": 0.71564305, "learning_rate": 2.539717736776241e-07, "loss": 0.73705238, "num_input_tokens_seen": 151404970, "step": 7008, "time_per_iteration": 2.6135783195495605 }, { "auxiliary_loss_clip": 0.0114543, "auxiliary_loss_mlp": 0.01024085, "balance_loss_clip": 1.04641604, "balance_loss_mlp": 1.01737928, "epoch": 0.8427824204893886, "flos": 23550074467200.0, "grad_norm": 1.3747878315919682, "language_loss": 0.76398373, "learning_rate": 2.535920055890097e-07, "loss": 0.78567892, "num_input_tokens_seen": 151426265, "step": 7009, "time_per_iteration": 2.495159387588501 }, { "auxiliary_loss_clip": 0.01100293, "auxiliary_loss_mlp": 0.01031369, "balance_loss_clip": 1.0378902, "balance_loss_mlp": 1.02379322, "epoch": 0.8429026633800276, "flos": 16143858120960.0, "grad_norm": 6.839989645619045, "language_loss": 0.64721787, "learning_rate": 2.5321250242910006e-07, "loss": 0.66853452, "num_input_tokens_seen": 151444180, "step": 7010, "time_per_iteration": 2.570758581161499 }, { "auxiliary_loss_clip": 0.01166276, "auxiliary_loss_mlp": 0.01030225, "balance_loss_clip": 1.04984272, "balance_loss_mlp": 1.02300739, "epoch": 0.8430229062706668, "flos": 22198540400640.0, "grad_norm": 1.6143572718132144, "language_loss": 0.86411989, "learning_rate": 2.5283326425546493e-07, "loss": 0.88608491, "num_input_tokens_seen": 151463290, "step": 7011, "time_per_iteration": 2.4638795852661133 }, { "auxiliary_loss_clip": 0.01114978, "auxiliary_loss_mlp": 0.01024973, "balance_loss_clip": 1.04535556, "balance_loss_mlp": 1.01855075, "epoch": 0.8431431491613058, "flos": 35330317683840.0, "grad_norm": 1.8584907344756423, "language_loss": 0.69150162, "learning_rate": 2.5245429112563443e-07, "loss": 0.71290123, "num_input_tokens_seen": 151483965, "step": 7012, "time_per_iteration": 2.706101179122925 }, { "auxiliary_loss_clip": 0.01150682, "auxiliary_loss_mlp": 0.01023798, "balance_loss_clip": 1.0474236, "balance_loss_mlp": 1.01654673, "epoch": 0.8432633920519449, "flos": 25812374808960.0, "grad_norm": 1.731550835977976, "language_loss": 0.81836396, "learning_rate": 2.5207558309709865e-07, "loss": 0.84010875, "num_input_tokens_seen": 151503700, "step": 7013, "time_per_iteration": 2.5519516468048096 }, { "auxiliary_loss_clip": 0.01029451, "auxiliary_loss_mlp": 0.00751231, "balance_loss_clip": 1.00794518, "balance_loss_mlp": 1.00008631, "epoch": 0.8433836349425841, "flos": 64959531592320.0, "grad_norm": 0.6555151056694961, "language_loss": 0.56296849, "learning_rate": 2.516971402273065e-07, "loss": 0.58077532, "num_input_tokens_seen": 151569765, "step": 7014, "time_per_iteration": 3.1411688327789307 }, { "auxiliary_loss_clip": 0.01134718, "auxiliary_loss_mlp": 0.01023173, "balance_loss_clip": 1.04217124, "balance_loss_mlp": 1.01653337, "epoch": 0.8435038778332231, "flos": 20229989483520.0, "grad_norm": 1.8006871294143818, "language_loss": 0.67680132, "learning_rate": 2.513189625736687e-07, "loss": 0.69838023, "num_input_tokens_seen": 151586660, "step": 7015, "time_per_iteration": 3.345205783843994 }, { "auxiliary_loss_clip": 0.01126466, "auxiliary_loss_mlp": 0.01029034, "balance_loss_clip": 1.04349232, "balance_loss_mlp": 1.02222729, "epoch": 0.8436241207238622, "flos": 20992229020800.0, "grad_norm": 2.1454323822711396, "language_loss": 0.71690416, "learning_rate": 2.509410501935534e-07, "loss": 0.73845911, "num_input_tokens_seen": 151602295, "step": 7016, "time_per_iteration": 2.5729012489318848 }, { "auxiliary_loss_clip": 0.01138581, "auxiliary_loss_mlp": 0.01030538, "balance_loss_clip": 1.04524183, "balance_loss_mlp": 1.02317429, "epoch": 0.8437443636145013, "flos": 14682257804160.0, "grad_norm": 2.1114331325008644, "language_loss": 0.75183892, "learning_rate": 2.5056340314429116e-07, "loss": 0.77353007, "num_input_tokens_seen": 151619760, "step": 7017, "time_per_iteration": 2.506105661392212 }, { "auxiliary_loss_clip": 0.01109604, "auxiliary_loss_mlp": 0.01027024, "balance_loss_clip": 1.04052138, "balance_loss_mlp": 1.01945376, "epoch": 0.8438646065051404, "flos": 21608814908160.0, "grad_norm": 2.3062927031070513, "language_loss": 0.80303901, "learning_rate": 2.5018602148316904e-07, "loss": 0.82440531, "num_input_tokens_seen": 151635795, "step": 7018, "time_per_iteration": 2.5797815322875977 }, { "auxiliary_loss_clip": 0.01163124, "auxiliary_loss_mlp": 0.01026754, "balance_loss_clip": 1.04898906, "balance_loss_mlp": 1.02003682, "epoch": 0.8439848493957794, "flos": 23289937194240.0, "grad_norm": 1.6929395366675075, "language_loss": 0.80287713, "learning_rate": 2.498089052674359e-07, "loss": 0.82477587, "num_input_tokens_seen": 151653770, "step": 7019, "time_per_iteration": 2.48478627204895 }, { "auxiliary_loss_clip": 0.01153389, "auxiliary_loss_mlp": 0.01029652, "balance_loss_clip": 1.04929554, "balance_loss_mlp": 1.02255869, "epoch": 0.8441050922864186, "flos": 19719339782400.0, "grad_norm": 2.0045510755493563, "language_loss": 0.74957693, "learning_rate": 2.494320545543007e-07, "loss": 0.77140731, "num_input_tokens_seen": 151673340, "step": 7020, "time_per_iteration": 2.4795982837677 }, { "auxiliary_loss_clip": 0.01165762, "auxiliary_loss_mlp": 0.0102778, "balance_loss_clip": 1.04704046, "balance_loss_mlp": 1.02024007, "epoch": 0.8442253351770577, "flos": 21835268202240.0, "grad_norm": 1.6386687250603638, "language_loss": 0.6634106, "learning_rate": 2.490554694009308e-07, "loss": 0.68534601, "num_input_tokens_seen": 151694205, "step": 7021, "time_per_iteration": 2.4592936038970947 }, { "auxiliary_loss_clip": 0.01152867, "auxiliary_loss_mlp": 0.0102132, "balance_loss_clip": 1.04588926, "balance_loss_mlp": 1.01412022, "epoch": 0.8443455780676967, "flos": 34346365447680.0, "grad_norm": 1.587720690628415, "language_loss": 0.78538877, "learning_rate": 2.4867914986445426e-07, "loss": 0.80713063, "num_input_tokens_seen": 151716595, "step": 7022, "time_per_iteration": 2.609590530395508 }, { "auxiliary_loss_clip": 0.01137732, "auxiliary_loss_mlp": 0.01026439, "balance_loss_clip": 1.04227877, "balance_loss_mlp": 1.02004385, "epoch": 0.8444658209583359, "flos": 48214599281280.0, "grad_norm": 1.8194855038893754, "language_loss": 0.71578121, "learning_rate": 2.483030960019581e-07, "loss": 0.73742294, "num_input_tokens_seen": 151740525, "step": 7023, "time_per_iteration": 2.7668371200561523 }, { "auxiliary_loss_clip": 0.01013014, "auxiliary_loss_mlp": 0.01001497, "balance_loss_clip": 1.00719965, "balance_loss_mlp": 1.00043571, "epoch": 0.8445860638489749, "flos": 68484773105280.0, "grad_norm": 0.7280118113912966, "language_loss": 0.55500335, "learning_rate": 2.479273078704891e-07, "loss": 0.57514846, "num_input_tokens_seen": 151793890, "step": 7024, "time_per_iteration": 3.0091805458068848 }, { "auxiliary_loss_clip": 0.01010277, "auxiliary_loss_mlp": 0.01001638, "balance_loss_clip": 1.01106954, "balance_loss_mlp": 1.00061846, "epoch": 0.844706306739614, "flos": 62833331882880.0, "grad_norm": 0.8348432571624139, "language_loss": 0.64742815, "learning_rate": 2.475517855270552e-07, "loss": 0.66754723, "num_input_tokens_seen": 151853970, "step": 7025, "time_per_iteration": 3.148796319961548 }, { "auxiliary_loss_clip": 0.01162287, "auxiliary_loss_mlp": 0.01020633, "balance_loss_clip": 1.04814935, "balance_loss_mlp": 1.01378798, "epoch": 0.8448265496302532, "flos": 14976114969600.0, "grad_norm": 2.0738376566496264, "language_loss": 0.7261945, "learning_rate": 2.4717652902862143e-07, "loss": 0.74802375, "num_input_tokens_seen": 151872945, "step": 7026, "time_per_iteration": 4.201322555541992 }, { "auxiliary_loss_clip": 0.01136213, "auxiliary_loss_mlp": 0.01025281, "balance_loss_clip": 1.04452872, "balance_loss_mlp": 1.01866794, "epoch": 0.8449467925208922, "flos": 23441265192960.0, "grad_norm": 1.7041507385838286, "language_loss": 0.81385761, "learning_rate": 2.4680153843211495e-07, "loss": 0.83547258, "num_input_tokens_seen": 151892875, "step": 7027, "time_per_iteration": 2.564302921295166 }, { "auxiliary_loss_clip": 0.01138272, "auxiliary_loss_mlp": 0.01025404, "balance_loss_clip": 1.04995883, "balance_loss_mlp": 1.01816821, "epoch": 0.8450670354115313, "flos": 22748045639040.0, "grad_norm": 2.0057053338902007, "language_loss": 0.72523695, "learning_rate": 2.464268137944212e-07, "loss": 0.74687374, "num_input_tokens_seen": 151914170, "step": 7028, "time_per_iteration": 3.2892391681671143 }, { "auxiliary_loss_clip": 0.01097653, "auxiliary_loss_mlp": 0.0103043, "balance_loss_clip": 1.04070318, "balance_loss_mlp": 1.02284515, "epoch": 0.8451872783021703, "flos": 29825571605760.0, "grad_norm": 1.9288888085736424, "language_loss": 0.77964038, "learning_rate": 2.46052355172385e-07, "loss": 0.8009212, "num_input_tokens_seen": 151932210, "step": 7029, "time_per_iteration": 2.6318960189819336 }, { "auxiliary_loss_clip": 0.01165006, "auxiliary_loss_mlp": 0.01023845, "balance_loss_clip": 1.04849279, "balance_loss_mlp": 1.01617408, "epoch": 0.8453075211928095, "flos": 21870029589120.0, "grad_norm": 1.7383339214581182, "language_loss": 0.74746686, "learning_rate": 2.456781626228128e-07, "loss": 0.7693553, "num_input_tokens_seen": 151951715, "step": 7030, "time_per_iteration": 2.4987714290618896 }, { "auxiliary_loss_clip": 0.01012842, "auxiliary_loss_mlp": 0.00751457, "balance_loss_clip": 1.00582576, "balance_loss_mlp": 1.00016582, "epoch": 0.8454277640834486, "flos": 58751869288320.0, "grad_norm": 0.9131311815935615, "language_loss": 0.66400921, "learning_rate": 2.453042362024675e-07, "loss": 0.68165219, "num_input_tokens_seen": 152004960, "step": 7031, "time_per_iteration": 3.1902053356170654 }, { "auxiliary_loss_clip": 0.01161224, "auxiliary_loss_mlp": 0.01025387, "balance_loss_clip": 1.04605722, "balance_loss_mlp": 1.0185684, "epoch": 0.8455480069740876, "flos": 27090076469760.0, "grad_norm": 1.4084345818115724, "language_loss": 0.73261273, "learning_rate": 2.449305759680751e-07, "loss": 0.75447881, "num_input_tokens_seen": 152026285, "step": 7032, "time_per_iteration": 2.5161941051483154 }, { "auxiliary_loss_clip": 0.01121454, "auxiliary_loss_mlp": 0.0102942, "balance_loss_clip": 1.04635882, "balance_loss_mlp": 1.02243781, "epoch": 0.8456682498647268, "flos": 27198670262400.0, "grad_norm": 1.4357030782437141, "language_loss": 0.74947762, "learning_rate": 2.445571819763188e-07, "loss": 0.77098644, "num_input_tokens_seen": 152048585, "step": 7033, "time_per_iteration": 2.61283802986145 }, { "auxiliary_loss_clip": 0.01162799, "auxiliary_loss_mlp": 0.01021848, "balance_loss_clip": 1.04870331, "balance_loss_mlp": 1.01515985, "epoch": 0.8457884927553658, "flos": 20631901737600.0, "grad_norm": 1.5358165758947169, "language_loss": 0.58248508, "learning_rate": 2.4418405428384227e-07, "loss": 0.60433155, "num_input_tokens_seen": 152068795, "step": 7034, "time_per_iteration": 2.4701952934265137 }, { "auxiliary_loss_clip": 0.01162756, "auxiliary_loss_mlp": 0.00761086, "balance_loss_clip": 1.04741859, "balance_loss_mlp": 1.00031829, "epoch": 0.8459087356460049, "flos": 15299023259520.0, "grad_norm": 2.045551206684969, "language_loss": 0.7159878, "learning_rate": 2.4381119294724864e-07, "loss": 0.73522627, "num_input_tokens_seen": 152086240, "step": 7035, "time_per_iteration": 2.443713426589966 }, { "auxiliary_loss_clip": 0.01162138, "auxiliary_loss_mlp": 0.01030113, "balance_loss_clip": 1.04654229, "balance_loss_mlp": 1.02339518, "epoch": 0.846028978536644, "flos": 18843155326080.0, "grad_norm": 2.1927478290687445, "language_loss": 0.5371564, "learning_rate": 2.434385980231004e-07, "loss": 0.55907887, "num_input_tokens_seen": 152105080, "step": 7036, "time_per_iteration": 2.434847116470337 }, { "auxiliary_loss_clip": 0.01148984, "auxiliary_loss_mlp": 0.01025758, "balance_loss_clip": 1.04554188, "balance_loss_mlp": 1.01894808, "epoch": 0.8461492214272831, "flos": 52661740285440.0, "grad_norm": 1.4537488929315034, "language_loss": 0.65519053, "learning_rate": 2.4306626956792043e-07, "loss": 0.67693794, "num_input_tokens_seen": 152130025, "step": 7037, "time_per_iteration": 2.768519639968872 }, { "auxiliary_loss_clip": 0.01148924, "auxiliary_loss_mlp": 0.01026475, "balance_loss_clip": 1.04461312, "balance_loss_mlp": 1.01970053, "epoch": 0.8462694643179222, "flos": 18588405093120.0, "grad_norm": 1.659560725816226, "language_loss": 0.75380051, "learning_rate": 2.4269420763819017e-07, "loss": 0.77555448, "num_input_tokens_seen": 152148070, "step": 7038, "time_per_iteration": 2.4677011966705322 }, { "auxiliary_loss_clip": 0.01147479, "auxiliary_loss_mlp": 0.01023134, "balance_loss_clip": 1.04598999, "balance_loss_mlp": 1.0165863, "epoch": 0.8463897072085613, "flos": 24387080163840.0, "grad_norm": 3.2460956279533297, "language_loss": 0.83223504, "learning_rate": 2.4232241229035223e-07, "loss": 0.8539412, "num_input_tokens_seen": 152165825, "step": 7039, "time_per_iteration": 2.499114990234375 }, { "auxiliary_loss_clip": 0.01045384, "auxiliary_loss_mlp": 0.01001906, "balance_loss_clip": 1.00716627, "balance_loss_mlp": 1.00082719, "epoch": 0.8465099500992004, "flos": 68702140258560.0, "grad_norm": 0.7534895916985822, "language_loss": 0.5677762, "learning_rate": 2.419508835808064e-07, "loss": 0.58824909, "num_input_tokens_seen": 152222380, "step": 7040, "time_per_iteration": 3.0388760566711426 }, { "auxiliary_loss_clip": 0.01135424, "auxiliary_loss_mlp": 0.01026383, "balance_loss_clip": 1.04432189, "balance_loss_mlp": 1.01904607, "epoch": 0.8466301929898394, "flos": 13735724561280.0, "grad_norm": 2.1848797883762376, "language_loss": 0.62629986, "learning_rate": 2.415796215659134e-07, "loss": 0.64791787, "num_input_tokens_seen": 152239085, "step": 7041, "time_per_iteration": 3.2734925746917725 }, { "auxiliary_loss_clip": 0.01122624, "auxiliary_loss_mlp": 0.01027326, "balance_loss_clip": 1.0389502, "balance_loss_mlp": 1.01997137, "epoch": 0.8467504358804786, "flos": 19241260738560.0, "grad_norm": 1.9983476178858195, "language_loss": 0.77301615, "learning_rate": 2.412086263019939e-07, "loss": 0.79451561, "num_input_tokens_seen": 152257110, "step": 7042, "time_per_iteration": 2.5249366760253906 }, { "auxiliary_loss_clip": 0.01160432, "auxiliary_loss_mlp": 0.01022487, "balance_loss_clip": 1.04791713, "balance_loss_mlp": 1.01578152, "epoch": 0.8468706787711177, "flos": 21324115710720.0, "grad_norm": 1.669404455405343, "language_loss": 0.7988373, "learning_rate": 2.408378978453276e-07, "loss": 0.82066655, "num_input_tokens_seen": 152277230, "step": 7043, "time_per_iteration": 2.4815549850463867 }, { "auxiliary_loss_clip": 0.01046007, "auxiliary_loss_mlp": 0.01000569, "balance_loss_clip": 1.00787294, "balance_loss_mlp": 0.99939519, "epoch": 0.8469909216617567, "flos": 64877439058560.0, "grad_norm": 0.8087358708888047, "language_loss": 0.63986182, "learning_rate": 2.404674362521533e-07, "loss": 0.66032761, "num_input_tokens_seen": 152335725, "step": 7044, "time_per_iteration": 2.985466957092285 }, { "auxiliary_loss_clip": 0.01150483, "auxiliary_loss_mlp": 0.01029657, "balance_loss_clip": 1.04832315, "balance_loss_mlp": 1.02287388, "epoch": 0.8471111645523959, "flos": 19280583152640.0, "grad_norm": 2.256584762813342, "language_loss": 0.74957973, "learning_rate": 2.4009724157866997e-07, "loss": 0.77138114, "num_input_tokens_seen": 152352785, "step": 7045, "time_per_iteration": 2.480224370956421 }, { "auxiliary_loss_clip": 0.01161367, "auxiliary_loss_mlp": 0.01022086, "balance_loss_clip": 1.04672194, "balance_loss_mlp": 1.01561034, "epoch": 0.8472314074430349, "flos": 22015826893440.0, "grad_norm": 1.7770821919531326, "language_loss": 0.76037455, "learning_rate": 2.3972731388103564e-07, "loss": 0.78220916, "num_input_tokens_seen": 152371265, "step": 7046, "time_per_iteration": 2.462667465209961 }, { "auxiliary_loss_clip": 0.00999339, "auxiliary_loss_mlp": 0.01000575, "balance_loss_clip": 1.00755239, "balance_loss_mlp": 0.9994548, "epoch": 0.847351650333674, "flos": 57882580243200.0, "grad_norm": 0.7982031210553313, "language_loss": 0.62391883, "learning_rate": 2.393576532153687e-07, "loss": 0.64391798, "num_input_tokens_seen": 152435050, "step": 7047, "time_per_iteration": 3.3935883045196533 }, { "auxiliary_loss_clip": 0.01043749, "auxiliary_loss_mlp": 0.01002015, "balance_loss_clip": 1.00769734, "balance_loss_mlp": 1.0008173, "epoch": 0.8474718932243132, "flos": 41284238313600.0, "grad_norm": 0.9258930230809921, "language_loss": 0.57809734, "learning_rate": 2.389882596377453e-07, "loss": 0.59855509, "num_input_tokens_seen": 152489315, "step": 7048, "time_per_iteration": 3.1417064666748047 }, { "auxiliary_loss_clip": 0.01160985, "auxiliary_loss_mlp": 0.01032963, "balance_loss_clip": 1.04558516, "balance_loss_mlp": 1.02586985, "epoch": 0.8475921361149522, "flos": 38180906974080.0, "grad_norm": 3.301794662411009, "language_loss": 0.7636615, "learning_rate": 2.386191332042031e-07, "loss": 0.78560102, "num_input_tokens_seen": 152511210, "step": 7049, "time_per_iteration": 2.6277542114257812 }, { "auxiliary_loss_clip": 0.01166713, "auxiliary_loss_mlp": 0.0103711, "balance_loss_clip": 1.04808497, "balance_loss_mlp": 1.02935863, "epoch": 0.8477123790055913, "flos": 25375054723200.0, "grad_norm": 1.6855541986123743, "language_loss": 0.7289989, "learning_rate": 2.3825027397073794e-07, "loss": 0.75103712, "num_input_tokens_seen": 152531685, "step": 7050, "time_per_iteration": 2.483851909637451 }, { "auxiliary_loss_clip": 0.01147056, "auxiliary_loss_mlp": 0.01029891, "balance_loss_clip": 1.04834414, "balance_loss_mlp": 1.02319181, "epoch": 0.8478326218962304, "flos": 30225185389440.0, "grad_norm": 1.972068967879117, "language_loss": 0.67121673, "learning_rate": 2.3788168199330515e-07, "loss": 0.69298625, "num_input_tokens_seen": 152553245, "step": 7051, "time_per_iteration": 2.540411949157715 }, { "auxiliary_loss_clip": 0.01120541, "auxiliary_loss_mlp": 0.01025456, "balance_loss_clip": 1.03946936, "balance_loss_mlp": 1.01873851, "epoch": 0.8479528647868695, "flos": 38213800853760.0, "grad_norm": 1.521415416204912, "language_loss": 0.72545052, "learning_rate": 2.3751335732782074e-07, "loss": 0.74691051, "num_input_tokens_seen": 152574505, "step": 7052, "time_per_iteration": 4.297223806381226 }, { "auxiliary_loss_clip": 0.01147807, "auxiliary_loss_mlp": 0.01033388, "balance_loss_clip": 1.0464921, "balance_loss_mlp": 1.02627754, "epoch": 0.8480731076775085, "flos": 20957790856320.0, "grad_norm": 1.7981743767137541, "language_loss": 0.7978012, "learning_rate": 2.371453000301582e-07, "loss": 0.8196131, "num_input_tokens_seen": 152593190, "step": 7053, "time_per_iteration": 2.476506233215332 }, { "auxiliary_loss_clip": 0.01116333, "auxiliary_loss_mlp": 0.01023961, "balance_loss_clip": 1.0426774, "balance_loss_mlp": 1.01759243, "epoch": 0.8481933505681477, "flos": 32596510487040.0, "grad_norm": 1.8707400271106658, "language_loss": 0.74233961, "learning_rate": 2.3677751015615222e-07, "loss": 0.76374257, "num_input_tokens_seen": 152615265, "step": 7054, "time_per_iteration": 3.3688371181488037 }, { "auxiliary_loss_clip": 0.01123615, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.04208183, "balance_loss_mlp": 1.02358425, "epoch": 0.8483135934587868, "flos": 20741177888640.0, "grad_norm": 1.729974346393647, "language_loss": 0.85372764, "learning_rate": 2.3640998776159593e-07, "loss": 0.87527287, "num_input_tokens_seen": 152632770, "step": 7055, "time_per_iteration": 2.5133745670318604 }, { "auxiliary_loss_clip": 0.0113756, "auxiliary_loss_mlp": 0.01027302, "balance_loss_clip": 1.04639328, "balance_loss_mlp": 1.02125764, "epoch": 0.8484338363494258, "flos": 21653057485440.0, "grad_norm": 1.7749461873710313, "language_loss": 0.80992484, "learning_rate": 2.3604273290224253e-07, "loss": 0.83157349, "num_input_tokens_seen": 152653485, "step": 7056, "time_per_iteration": 2.536184549331665 }, { "auxiliary_loss_clip": 0.01137878, "auxiliary_loss_mlp": 0.01030906, "balance_loss_clip": 1.04687786, "balance_loss_mlp": 1.02319288, "epoch": 0.848554079240065, "flos": 15013964926080.0, "grad_norm": 1.8422821079901068, "language_loss": 0.74555367, "learning_rate": 2.356757456338039e-07, "loss": 0.76724154, "num_input_tokens_seen": 152670970, "step": 7057, "time_per_iteration": 2.5079658031463623 }, { "auxiliary_loss_clip": 0.01035205, "auxiliary_loss_mlp": 0.01000904, "balance_loss_clip": 1.01215971, "balance_loss_mlp": 0.99990863, "epoch": 0.848674322130704, "flos": 68060453742720.0, "grad_norm": 0.7519293817279145, "language_loss": 0.59058273, "learning_rate": 2.3530902601195147e-07, "loss": 0.61094391, "num_input_tokens_seen": 152739460, "step": 7058, "time_per_iteration": 3.2415740489959717 }, { "auxiliary_loss_clip": 0.01148962, "auxiliary_loss_mlp": 0.01031584, "balance_loss_clip": 1.04750443, "balance_loss_mlp": 1.02274418, "epoch": 0.8487945650213431, "flos": 18475788977280.0, "grad_norm": 2.2207044979595563, "language_loss": 0.78925228, "learning_rate": 2.34942574092317e-07, "loss": 0.81105775, "num_input_tokens_seen": 152754710, "step": 7059, "time_per_iteration": 2.4910688400268555 }, { "auxiliary_loss_clip": 0.01155461, "auxiliary_loss_mlp": 0.01029793, "balance_loss_clip": 1.04897904, "balance_loss_mlp": 1.02297425, "epoch": 0.8489148079119821, "flos": 23473189405440.0, "grad_norm": 2.121203528887361, "language_loss": 0.7660414, "learning_rate": 2.3457638993049045e-07, "loss": 0.78789395, "num_input_tokens_seen": 152772700, "step": 7060, "time_per_iteration": 2.5215368270874023 }, { "auxiliary_loss_clip": 0.01095489, "auxiliary_loss_mlp": 0.01027832, "balance_loss_clip": 1.04227948, "balance_loss_mlp": 1.02031589, "epoch": 0.8490350508026213, "flos": 19937604775680.0, "grad_norm": 1.9749433250007153, "language_loss": 0.64293337, "learning_rate": 2.3421047358202252e-07, "loss": 0.66416663, "num_input_tokens_seen": 152791550, "step": 7061, "time_per_iteration": 2.6150450706481934 }, { "auxiliary_loss_clip": 0.01154046, "auxiliary_loss_mlp": 0.01027205, "balance_loss_clip": 1.04828858, "balance_loss_mlp": 1.01988888, "epoch": 0.8491552936932604, "flos": 24279958828800.0, "grad_norm": 2.242554804561456, "language_loss": 0.83037889, "learning_rate": 2.3384482510242144e-07, "loss": 0.85219139, "num_input_tokens_seen": 152809410, "step": 7062, "time_per_iteration": 2.5155465602874756 }, { "auxiliary_loss_clip": 0.0116319, "auxiliary_loss_mlp": 0.01028625, "balance_loss_clip": 1.04504943, "balance_loss_mlp": 1.02116513, "epoch": 0.8492755365838994, "flos": 22522526098560.0, "grad_norm": 2.167255765375438, "language_loss": 0.77254295, "learning_rate": 2.3347944454715575e-07, "loss": 0.79446113, "num_input_tokens_seen": 152825800, "step": 7063, "time_per_iteration": 2.466905117034912 }, { "auxiliary_loss_clip": 0.0116379, "auxiliary_loss_mlp": 0.01027141, "balance_loss_clip": 1.04659986, "balance_loss_mlp": 1.01984859, "epoch": 0.8493957794745386, "flos": 26980441182720.0, "grad_norm": 1.8733654165639655, "language_loss": 0.66928327, "learning_rate": 2.331143319716542e-07, "loss": 0.69119257, "num_input_tokens_seen": 152845330, "step": 7064, "time_per_iteration": 2.513002395629883 }, { "auxiliary_loss_clip": 0.01120894, "auxiliary_loss_mlp": 0.01021942, "balance_loss_clip": 1.04270148, "balance_loss_mlp": 1.01462865, "epoch": 0.8495160223651776, "flos": 29861985018240.0, "grad_norm": 2.2038569048485135, "language_loss": 0.66241032, "learning_rate": 2.3274948743130363e-07, "loss": 0.68383873, "num_input_tokens_seen": 152865165, "step": 7065, "time_per_iteration": 2.6240925788879395 }, { "auxiliary_loss_clip": 0.01161425, "auxiliary_loss_mlp": 0.0102316, "balance_loss_clip": 1.04571021, "balance_loss_mlp": 1.01671994, "epoch": 0.8496362652558167, "flos": 23075443128960.0, "grad_norm": 1.5622522385818285, "language_loss": 0.79262614, "learning_rate": 2.3238491098145085e-07, "loss": 0.81447196, "num_input_tokens_seen": 152884695, "step": 7066, "time_per_iteration": 2.478912591934204 }, { "auxiliary_loss_clip": 0.01152458, "auxiliary_loss_mlp": 0.01027923, "balance_loss_clip": 1.04833734, "balance_loss_mlp": 1.0211103, "epoch": 0.8497565081464559, "flos": 14609107756800.0, "grad_norm": 2.148362165666817, "language_loss": 0.73359048, "learning_rate": 2.3202060267740141e-07, "loss": 0.75539428, "num_input_tokens_seen": 152902220, "step": 7067, "time_per_iteration": 3.5590946674346924 }, { "auxiliary_loss_clip": 0.0110167, "auxiliary_loss_mlp": 0.01023786, "balance_loss_clip": 1.03814054, "balance_loss_mlp": 1.01677692, "epoch": 0.8498767510370949, "flos": 21136446126720.0, "grad_norm": 2.3594405136862386, "language_loss": 0.76918495, "learning_rate": 2.3165656257442044e-07, "loss": 0.79043955, "num_input_tokens_seen": 152920740, "step": 7068, "time_per_iteration": 2.6745221614837646 }, { "auxiliary_loss_clip": 0.01146005, "auxiliary_loss_mlp": 0.01027064, "balance_loss_clip": 1.04670024, "balance_loss_mlp": 1.02051377, "epoch": 0.849996993927734, "flos": 23654538195840.0, "grad_norm": 1.8208785741318407, "language_loss": 0.90143144, "learning_rate": 2.31292790727734e-07, "loss": 0.9231621, "num_input_tokens_seen": 152938305, "step": 7069, "time_per_iteration": 2.496387243270874 }, { "auxiliary_loss_clip": 0.01159175, "auxiliary_loss_mlp": 0.01024799, "balance_loss_clip": 1.04519796, "balance_loss_mlp": 1.01798582, "epoch": 0.8501172368183731, "flos": 20558069331840.0, "grad_norm": 2.5900098484439718, "language_loss": 0.79984736, "learning_rate": 2.3092928719252392e-07, "loss": 0.8216871, "num_input_tokens_seen": 152956705, "step": 7070, "time_per_iteration": 2.464017629623413 }, { "auxiliary_loss_clip": 0.0114647, "auxiliary_loss_mlp": 0.0102457, "balance_loss_clip": 1.04431915, "balance_loss_mlp": 1.0178194, "epoch": 0.8502374797090122, "flos": 22272624201600.0, "grad_norm": 2.0437908963875056, "language_loss": 0.78396118, "learning_rate": 2.3056605202393475e-07, "loss": 0.80567157, "num_input_tokens_seen": 152974265, "step": 7071, "time_per_iteration": 2.483010768890381 }, { "auxiliary_loss_clip": 0.01142267, "auxiliary_loss_mlp": 0.00761239, "balance_loss_clip": 1.04024696, "balance_loss_mlp": 1.00029922, "epoch": 0.8503577225996513, "flos": 23659817495040.0, "grad_norm": 2.5387385051966485, "language_loss": 0.66886067, "learning_rate": 2.3020308527706888e-07, "loss": 0.68789577, "num_input_tokens_seen": 152993680, "step": 7072, "time_per_iteration": 2.516066551208496 }, { "auxiliary_loss_clip": 0.01138337, "auxiliary_loss_mlp": 0.01024961, "balance_loss_clip": 1.04293633, "balance_loss_mlp": 1.01808906, "epoch": 0.8504779654902904, "flos": 26758513002240.0, "grad_norm": 1.9609204586111524, "language_loss": 0.89144981, "learning_rate": 2.2984038700698715e-07, "loss": 0.91308278, "num_input_tokens_seen": 153012990, "step": 7073, "time_per_iteration": 2.579803705215454 }, { "auxiliary_loss_clip": 0.01147215, "auxiliary_loss_mlp": 0.01026394, "balance_loss_clip": 1.04654384, "balance_loss_mlp": 1.01992428, "epoch": 0.8505982083809295, "flos": 26468247196800.0, "grad_norm": 1.8273253395274969, "language_loss": 0.78899539, "learning_rate": 2.2947795726871222e-07, "loss": 0.81073141, "num_input_tokens_seen": 153034015, "step": 7074, "time_per_iteration": 2.5389277935028076 }, { "auxiliary_loss_clip": 0.01148552, "auxiliary_loss_mlp": 0.00760953, "balance_loss_clip": 1.04927707, "balance_loss_mlp": 1.00033545, "epoch": 0.8507184512715685, "flos": 20303390926080.0, "grad_norm": 1.7106387109087806, "language_loss": 0.85518283, "learning_rate": 2.2911579611722253e-07, "loss": 0.87427789, "num_input_tokens_seen": 153053160, "step": 7075, "time_per_iteration": 2.486285448074341 }, { "auxiliary_loss_clip": 0.01129282, "auxiliary_loss_mlp": 0.01027109, "balance_loss_clip": 1.04268467, "balance_loss_mlp": 1.02067518, "epoch": 0.8508386941622077, "flos": 19025186474880.0, "grad_norm": 2.2527955360846206, "language_loss": 0.87279129, "learning_rate": 2.2875390360745905e-07, "loss": 0.89435518, "num_input_tokens_seen": 153072565, "step": 7076, "time_per_iteration": 2.512643575668335 }, { "auxiliary_loss_clip": 0.01124152, "auxiliary_loss_mlp": 0.01022839, "balance_loss_clip": 1.04208386, "balance_loss_mlp": 1.01567459, "epoch": 0.8509589370528468, "flos": 16433405654400.0, "grad_norm": 1.6312882088282665, "language_loss": 0.77329135, "learning_rate": 2.2839227979432008e-07, "loss": 0.7947613, "num_input_tokens_seen": 153090215, "step": 7077, "time_per_iteration": 2.531562328338623 }, { "auxiliary_loss_clip": 0.01138474, "auxiliary_loss_mlp": 0.01025521, "balance_loss_clip": 1.04448974, "balance_loss_mlp": 1.01831222, "epoch": 0.8510791799434858, "flos": 18259714713600.0, "grad_norm": 1.693106886776018, "language_loss": 0.84886175, "learning_rate": 2.2803092473266373e-07, "loss": 0.8705017, "num_input_tokens_seen": 153107740, "step": 7078, "time_per_iteration": 4.158874988555908 }, { "auxiliary_loss_clip": 0.01165967, "auxiliary_loss_mlp": 0.01030359, "balance_loss_clip": 1.04950404, "balance_loss_mlp": 1.02342975, "epoch": 0.851199422834125, "flos": 23441372933760.0, "grad_norm": 2.332604473455492, "language_loss": 0.86490464, "learning_rate": 2.2766983847730724e-07, "loss": 0.88686794, "num_input_tokens_seen": 153127410, "step": 7079, "time_per_iteration": 2.4785001277923584 }, { "auxiliary_loss_clip": 0.01130147, "auxiliary_loss_mlp": 0.01031964, "balance_loss_clip": 1.04254985, "balance_loss_mlp": 1.02457345, "epoch": 0.851319665724764, "flos": 16289404030080.0, "grad_norm": 2.3082065367252187, "language_loss": 0.66663706, "learning_rate": 2.2730902108302663e-07, "loss": 0.68825817, "num_input_tokens_seen": 153144325, "step": 7080, "time_per_iteration": 3.2927205562591553 }, { "auxiliary_loss_clip": 0.01127792, "auxiliary_loss_mlp": 0.01026088, "balance_loss_clip": 1.04105759, "balance_loss_mlp": 1.0186646, "epoch": 0.8514399086154031, "flos": 18989347680000.0, "grad_norm": 1.6027284782879994, "language_loss": 0.68491662, "learning_rate": 2.269484726045583e-07, "loss": 0.70645547, "num_input_tokens_seen": 153163240, "step": 7081, "time_per_iteration": 2.5224485397338867 }, { "auxiliary_loss_clip": 0.01128441, "auxiliary_loss_mlp": 0.01023868, "balance_loss_clip": 1.04659975, "balance_loss_mlp": 1.01729345, "epoch": 0.8515601515060423, "flos": 24571194301440.0, "grad_norm": 1.6613215328222168, "language_loss": 0.78881681, "learning_rate": 2.2658819309659672e-07, "loss": 0.81033993, "num_input_tokens_seen": 153183440, "step": 7082, "time_per_iteration": 2.6303181648254395 }, { "auxiliary_loss_clip": 0.01132763, "auxiliary_loss_mlp": 0.01023381, "balance_loss_clip": 1.0466038, "balance_loss_mlp": 1.01682174, "epoch": 0.8516803943966813, "flos": 19529443555200.0, "grad_norm": 1.8145613117354946, "language_loss": 0.84845209, "learning_rate": 2.2622818261379706e-07, "loss": 0.87001348, "num_input_tokens_seen": 153200460, "step": 7083, "time_per_iteration": 2.5482494831085205 }, { "auxiliary_loss_clip": 0.01133199, "auxiliary_loss_mlp": 0.01027624, "balance_loss_clip": 1.04413331, "balance_loss_mlp": 1.01994705, "epoch": 0.8518006372873204, "flos": 20265792364800.0, "grad_norm": 1.7940183998586234, "language_loss": 0.74710262, "learning_rate": 2.2586844121077142e-07, "loss": 0.76871085, "num_input_tokens_seen": 153218970, "step": 7084, "time_per_iteration": 2.521526575088501 }, { "auxiliary_loss_clip": 0.01104388, "auxiliary_loss_mlp": 0.01025956, "balance_loss_clip": 1.03960371, "balance_loss_mlp": 1.01886296, "epoch": 0.8519208801779595, "flos": 24133227770880.0, "grad_norm": 2.0112045670666387, "language_loss": 0.71774584, "learning_rate": 2.2550896894209215e-07, "loss": 0.73904932, "num_input_tokens_seen": 153238485, "step": 7085, "time_per_iteration": 2.6175196170806885 }, { "auxiliary_loss_clip": 0.01008525, "auxiliary_loss_mlp": 0.01001505, "balance_loss_clip": 1.01063967, "balance_loss_mlp": 1.00031304, "epoch": 0.8520411230685986, "flos": 63035223252480.0, "grad_norm": 0.6779881122009657, "language_loss": 0.56654203, "learning_rate": 2.2514976586229184e-07, "loss": 0.58664232, "num_input_tokens_seen": 153306430, "step": 7086, "time_per_iteration": 3.3455471992492676 }, { "auxiliary_loss_clip": 0.0104627, "auxiliary_loss_mlp": 0.01002574, "balance_loss_clip": 1.00864959, "balance_loss_mlp": 1.00147772, "epoch": 0.8521613659592376, "flos": 65836865283840.0, "grad_norm": 0.7508151229877704, "language_loss": 0.54663694, "learning_rate": 2.247908320258609e-07, "loss": 0.56712544, "num_input_tokens_seen": 153366520, "step": 7087, "time_per_iteration": 3.085184335708618 }, { "auxiliary_loss_clip": 0.01099618, "auxiliary_loss_mlp": 0.01024735, "balance_loss_clip": 1.04253721, "balance_loss_mlp": 1.01709962, "epoch": 0.8522816088498768, "flos": 23112323418240.0, "grad_norm": 2.000702376488285, "language_loss": 0.79355013, "learning_rate": 2.2443216748724914e-07, "loss": 0.81479371, "num_input_tokens_seen": 153387230, "step": 7088, "time_per_iteration": 2.6145870685577393 }, { "auxiliary_loss_clip": 0.01152943, "auxiliary_loss_mlp": 0.00760592, "balance_loss_clip": 1.04770005, "balance_loss_mlp": 1.00026321, "epoch": 0.8524018517405159, "flos": 31758140073600.0, "grad_norm": 2.246860344815966, "language_loss": 0.74306846, "learning_rate": 2.2407377230086588e-07, "loss": 0.76220381, "num_input_tokens_seen": 153409585, "step": 7089, "time_per_iteration": 2.584655523300171 }, { "auxiliary_loss_clip": 0.01118506, "auxiliary_loss_mlp": 0.01031128, "balance_loss_clip": 1.04352224, "balance_loss_mlp": 1.0240972, "epoch": 0.8525220946311549, "flos": 18690318956160.0, "grad_norm": 1.9773833379981034, "language_loss": 0.8327986, "learning_rate": 2.23715646521079e-07, "loss": 0.85429502, "num_input_tokens_seen": 153427105, "step": 7090, "time_per_iteration": 2.5695416927337646 }, { "auxiliary_loss_clip": 0.01154161, "auxiliary_loss_mlp": 0.00761258, "balance_loss_clip": 1.0465126, "balance_loss_mlp": 1.00030243, "epoch": 0.852642337521794, "flos": 21793216354560.0, "grad_norm": 1.946998432856179, "language_loss": 0.83912277, "learning_rate": 2.2335779020221724e-07, "loss": 0.85827696, "num_input_tokens_seen": 153443725, "step": 7091, "time_per_iteration": 2.4974160194396973 }, { "auxiliary_loss_clip": 0.01038967, "auxiliary_loss_mlp": 0.01000296, "balance_loss_clip": 1.00859499, "balance_loss_mlp": 0.99926513, "epoch": 0.8527625804124331, "flos": 69040132260480.0, "grad_norm": 0.8072700180855711, "language_loss": 0.56415737, "learning_rate": 2.2300020339856497e-07, "loss": 0.58455002, "num_input_tokens_seen": 153506410, "step": 7092, "time_per_iteration": 3.138139009475708 }, { "auxiliary_loss_clip": 0.01134368, "auxiliary_loss_mlp": 0.01023438, "balance_loss_clip": 1.04419136, "balance_loss_mlp": 1.01691103, "epoch": 0.8528828233030722, "flos": 26979399688320.0, "grad_norm": 2.2935103499693184, "language_loss": 0.78152347, "learning_rate": 2.2264288616436966e-07, "loss": 0.8031016, "num_input_tokens_seen": 153526665, "step": 7093, "time_per_iteration": 3.5478947162628174 }, { "auxiliary_loss_clip": 0.01130921, "auxiliary_loss_mlp": 0.01027187, "balance_loss_clip": 1.04675484, "balance_loss_mlp": 1.02034163, "epoch": 0.8530030661937112, "flos": 17487598936320.0, "grad_norm": 1.8855149353758696, "language_loss": 0.72376919, "learning_rate": 2.222858385538351e-07, "loss": 0.74535024, "num_input_tokens_seen": 153543465, "step": 7094, "time_per_iteration": 2.50673508644104 }, { "auxiliary_loss_clip": 0.01147935, "auxiliary_loss_mlp": 0.01028807, "balance_loss_clip": 1.04537678, "balance_loss_mlp": 1.02173185, "epoch": 0.8531233090843504, "flos": 22160798184960.0, "grad_norm": 1.6001851121193664, "language_loss": 0.6819452, "learning_rate": 2.2192906062112527e-07, "loss": 0.70371258, "num_input_tokens_seen": 153563340, "step": 7095, "time_per_iteration": 2.501384735107422 }, { "auxiliary_loss_clip": 0.01161928, "auxiliary_loss_mlp": 0.01025397, "balance_loss_clip": 1.04602659, "balance_loss_mlp": 1.01860821, "epoch": 0.8532435519749895, "flos": 37635388145280.0, "grad_norm": 1.5481216993896698, "language_loss": 0.70452368, "learning_rate": 2.2157255242036377e-07, "loss": 0.72639692, "num_input_tokens_seen": 153587005, "step": 7096, "time_per_iteration": 2.607110023498535 }, { "auxiliary_loss_clip": 0.0111865, "auxiliary_loss_mlp": 0.01026879, "balance_loss_clip": 1.04224229, "balance_loss_mlp": 1.02019143, "epoch": 0.8533637948656285, "flos": 21398163598080.0, "grad_norm": 1.6059794880448326, "language_loss": 0.73915172, "learning_rate": 2.2121631400563135e-07, "loss": 0.760607, "num_input_tokens_seen": 153606835, "step": 7097, "time_per_iteration": 2.5775718688964844 }, { "auxiliary_loss_clip": 0.01044558, "auxiliary_loss_mlp": 0.01001481, "balance_loss_clip": 1.0115453, "balance_loss_mlp": 1.00049198, "epoch": 0.8534840377562677, "flos": 53345122490880.0, "grad_norm": 0.7783484311923864, "language_loss": 0.53025484, "learning_rate": 2.208603454309701e-07, "loss": 0.55071527, "num_input_tokens_seen": 153664925, "step": 7098, "time_per_iteration": 3.0725367069244385 }, { "auxiliary_loss_clip": 0.01106779, "auxiliary_loss_mlp": 0.01026255, "balance_loss_clip": 1.04305255, "balance_loss_mlp": 1.0191704, "epoch": 0.8536042806469067, "flos": 20814148368000.0, "grad_norm": 1.8889054053657617, "language_loss": 0.70997918, "learning_rate": 2.2050464675037994e-07, "loss": 0.73130953, "num_input_tokens_seen": 153683550, "step": 7099, "time_per_iteration": 2.6063196659088135 }, { "auxiliary_loss_clip": 0.01135709, "auxiliary_loss_mlp": 0.01028817, "balance_loss_clip": 1.04454231, "balance_loss_mlp": 1.02163458, "epoch": 0.8537245235375458, "flos": 24681368292480.0, "grad_norm": 2.0834455857019663, "language_loss": 0.72423565, "learning_rate": 2.2014921801782016e-07, "loss": 0.74588084, "num_input_tokens_seen": 153703040, "step": 7100, "time_per_iteration": 2.5610036849975586 }, { "auxiliary_loss_clip": 0.01135974, "auxiliary_loss_mlp": 0.01024929, "balance_loss_clip": 1.04060292, "balance_loss_mlp": 1.01802945, "epoch": 0.853844766428185, "flos": 24384817607040.0, "grad_norm": 1.8246362742583777, "language_loss": 0.73818231, "learning_rate": 2.1979405928720872e-07, "loss": 0.75979137, "num_input_tokens_seen": 153722695, "step": 7101, "time_per_iteration": 2.564816951751709 }, { "auxiliary_loss_clip": 0.01137723, "auxiliary_loss_mlp": 0.0102667, "balance_loss_clip": 1.04341316, "balance_loss_mlp": 1.02005386, "epoch": 0.853965009318824, "flos": 20955707867520.0, "grad_norm": 1.409318191139175, "language_loss": 0.792858, "learning_rate": 2.1943917061242257e-07, "loss": 0.81450194, "num_input_tokens_seen": 153742550, "step": 7102, "time_per_iteration": 2.5368049144744873 }, { "auxiliary_loss_clip": 0.01156628, "auxiliary_loss_mlp": 0.00761409, "balance_loss_clip": 1.04778934, "balance_loss_mlp": 1.00027108, "epoch": 0.8540852522094631, "flos": 24201816791040.0, "grad_norm": 1.5657498606566191, "language_loss": 0.66541851, "learning_rate": 2.1908455204729903e-07, "loss": 0.68459892, "num_input_tokens_seen": 153761700, "step": 7103, "time_per_iteration": 3.312654495239258 }, { "auxiliary_loss_clip": 0.01134026, "auxiliary_loss_mlp": 0.01024995, "balance_loss_clip": 1.04283011, "balance_loss_mlp": 1.01808095, "epoch": 0.8542054951001022, "flos": 25082921410560.0, "grad_norm": 1.8388413769805714, "language_loss": 0.7832495, "learning_rate": 2.1873020364563265e-07, "loss": 0.80483973, "num_input_tokens_seen": 153780765, "step": 7104, "time_per_iteration": 3.403931140899658 }, { "auxiliary_loss_clip": 0.01145734, "auxiliary_loss_mlp": 0.01018123, "balance_loss_clip": 1.04601431, "balance_loss_mlp": 1.01103604, "epoch": 0.8543257379907413, "flos": 24316551809280.0, "grad_norm": 2.1154769325732605, "language_loss": 0.75910103, "learning_rate": 2.183761254611789e-07, "loss": 0.78073955, "num_input_tokens_seen": 153801090, "step": 7105, "time_per_iteration": 2.5172483921051025 }, { "auxiliary_loss_clip": 0.0114899, "auxiliary_loss_mlp": 0.01024504, "balance_loss_clip": 1.04837799, "balance_loss_mlp": 1.0179565, "epoch": 0.8544459808813804, "flos": 55286630467200.0, "grad_norm": 2.0624387480063238, "language_loss": 0.70188475, "learning_rate": 2.1802231754764987e-07, "loss": 0.7236197, "num_input_tokens_seen": 153826530, "step": 7106, "time_per_iteration": 3.4550530910491943 }, { "auxiliary_loss_clip": 0.01137044, "auxiliary_loss_mlp": 0.01026885, "balance_loss_clip": 1.04333031, "balance_loss_mlp": 1.01915431, "epoch": 0.8545662237720195, "flos": 25776248705280.0, "grad_norm": 1.8284435046589689, "language_loss": 0.76550949, "learning_rate": 2.17668779958718e-07, "loss": 0.78714871, "num_input_tokens_seen": 153849110, "step": 7107, "time_per_iteration": 2.5585248470306396 }, { "auxiliary_loss_clip": 0.01164182, "auxiliary_loss_mlp": 0.01027634, "balance_loss_clip": 1.04839826, "balance_loss_mlp": 1.02042508, "epoch": 0.8546864666626586, "flos": 11108320427520.0, "grad_norm": 2.1980495197941514, "language_loss": 0.80796874, "learning_rate": 2.1731551274801553e-07, "loss": 0.82988691, "num_input_tokens_seen": 153865550, "step": 7108, "time_per_iteration": 2.4387567043304443 }, { "auxiliary_loss_clip": 0.01134685, "auxiliary_loss_mlp": 0.01026142, "balance_loss_clip": 1.04409838, "balance_loss_mlp": 1.01890635, "epoch": 0.8548067095532976, "flos": 25520169669120.0, "grad_norm": 2.0299040063634495, "language_loss": 0.61442196, "learning_rate": 2.169625159691324e-07, "loss": 0.63603026, "num_input_tokens_seen": 153885425, "step": 7109, "time_per_iteration": 2.543661117553711 }, { "auxiliary_loss_clip": 0.01118034, "auxiliary_loss_mlp": 0.01025039, "balance_loss_clip": 1.04342449, "balance_loss_mlp": 1.01798165, "epoch": 0.8549269524439368, "flos": 24717853532160.0, "grad_norm": 2.228441603914061, "language_loss": 0.73923558, "learning_rate": 2.1660978967561784e-07, "loss": 0.76066631, "num_input_tokens_seen": 153904760, "step": 7110, "time_per_iteration": 2.6260414123535156 }, { "auxiliary_loss_clip": 0.01158299, "auxiliary_loss_mlp": 0.0102648, "balance_loss_clip": 1.04264927, "balance_loss_mlp": 1.01943183, "epoch": 0.8550471953345758, "flos": 19825599191040.0, "grad_norm": 2.445236501993734, "language_loss": 0.79220951, "learning_rate": 2.1625733392098035e-07, "loss": 0.81405723, "num_input_tokens_seen": 153920370, "step": 7111, "time_per_iteration": 2.4345145225524902 }, { "auxiliary_loss_clip": 0.01161436, "auxiliary_loss_mlp": 0.01025306, "balance_loss_clip": 1.04635286, "balance_loss_mlp": 1.01879692, "epoch": 0.8551674382252149, "flos": 22820441500800.0, "grad_norm": 1.692352145656544, "language_loss": 0.79617375, "learning_rate": 2.159051487586867e-07, "loss": 0.81804121, "num_input_tokens_seen": 153940500, "step": 7112, "time_per_iteration": 2.478616714477539 }, { "auxiliary_loss_clip": 0.01143697, "auxiliary_loss_mlp": 0.01026497, "balance_loss_clip": 1.04888487, "balance_loss_mlp": 1.01922226, "epoch": 0.8552876811158541, "flos": 20631255292800.0, "grad_norm": 2.5653553296721414, "language_loss": 0.72672504, "learning_rate": 2.155532342421642e-07, "loss": 0.74842697, "num_input_tokens_seen": 153958500, "step": 7113, "time_per_iteration": 2.560652494430542 }, { "auxiliary_loss_clip": 0.01150284, "auxiliary_loss_mlp": 0.01026434, "balance_loss_clip": 1.04474068, "balance_loss_mlp": 1.01908803, "epoch": 0.8554079240064931, "flos": 23112359331840.0, "grad_norm": 1.8298395763940394, "language_loss": 0.78571332, "learning_rate": 2.1520159042479636e-07, "loss": 0.80748045, "num_input_tokens_seen": 153976790, "step": 7114, "time_per_iteration": 2.5248165130615234 }, { "auxiliary_loss_clip": 0.01147671, "auxiliary_loss_mlp": 0.01024466, "balance_loss_clip": 1.04663479, "balance_loss_mlp": 1.01787663, "epoch": 0.8555281668971322, "flos": 22128047959680.0, "grad_norm": 1.9894412486379829, "language_loss": 0.71075153, "learning_rate": 2.148502173599287e-07, "loss": 0.7324729, "num_input_tokens_seen": 153994930, "step": 7115, "time_per_iteration": 2.503873348236084 }, { "auxiliary_loss_clip": 0.01129259, "auxiliary_loss_mlp": 0.01026942, "balance_loss_clip": 1.04357839, "balance_loss_mlp": 1.01964986, "epoch": 0.8556484097877713, "flos": 31139040234240.0, "grad_norm": 2.130948885000986, "language_loss": 0.66043174, "learning_rate": 2.1449911510086372e-07, "loss": 0.68199378, "num_input_tokens_seen": 154014400, "step": 7116, "time_per_iteration": 2.5927248001098633 }, { "auxiliary_loss_clip": 0.0114727, "auxiliary_loss_mlp": 0.01024881, "balance_loss_clip": 1.04553294, "balance_loss_mlp": 1.01835763, "epoch": 0.8557686526784104, "flos": 24316551809280.0, "grad_norm": 1.9316531468792806, "language_loss": 0.76900971, "learning_rate": 2.141482837008628e-07, "loss": 0.79073119, "num_input_tokens_seen": 154034940, "step": 7117, "time_per_iteration": 2.519806385040283 }, { "auxiliary_loss_clip": 0.01142201, "auxiliary_loss_mlp": 0.01025752, "balance_loss_clip": 1.04438066, "balance_loss_mlp": 1.01830101, "epoch": 0.8558888955690495, "flos": 17712723427200.0, "grad_norm": 1.865499358594141, "language_loss": 0.71863925, "learning_rate": 2.1379772321314826e-07, "loss": 0.74031878, "num_input_tokens_seen": 154052985, "step": 7118, "time_per_iteration": 2.4904842376708984 }, { "auxiliary_loss_clip": 0.01084942, "auxiliary_loss_mlp": 0.01028299, "balance_loss_clip": 1.03977692, "balance_loss_mlp": 1.02137613, "epoch": 0.8560091384596886, "flos": 19171702051200.0, "grad_norm": 2.074005724409962, "language_loss": 0.8119117, "learning_rate": 2.1344743369089802e-07, "loss": 0.83304417, "num_input_tokens_seen": 154068765, "step": 7119, "time_per_iteration": 3.3647048473358154 }, { "auxiliary_loss_clip": 0.01133237, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.04423213, "balance_loss_mlp": 1.02222908, "epoch": 0.8561293813503277, "flos": 23914855036800.0, "grad_norm": 1.5915827003851792, "language_loss": 0.82098871, "learning_rate": 2.130974151872522e-07, "loss": 0.84261298, "num_input_tokens_seen": 154089100, "step": 7120, "time_per_iteration": 2.5654871463775635 }, { "auxiliary_loss_clip": 0.01119248, "auxiliary_loss_mlp": 0.01023563, "balance_loss_clip": 1.04499924, "balance_loss_mlp": 1.01620793, "epoch": 0.8562496242409667, "flos": 22529206028160.0, "grad_norm": 2.040549387611809, "language_loss": 0.78584158, "learning_rate": 2.1274766775530773e-07, "loss": 0.80726969, "num_input_tokens_seen": 154108965, "step": 7121, "time_per_iteration": 2.5666699409484863 }, { "auxiliary_loss_clip": 0.01165694, "auxiliary_loss_mlp": 0.01025501, "balance_loss_clip": 1.04737842, "balance_loss_mlp": 1.01837826, "epoch": 0.8563698671316058, "flos": 14712745472640.0, "grad_norm": 2.0914246848158475, "language_loss": 0.79474115, "learning_rate": 2.1239819144812077e-07, "loss": 0.81665313, "num_input_tokens_seen": 154123425, "step": 7122, "time_per_iteration": 2.4316513538360596 }, { "auxiliary_loss_clip": 0.01114485, "auxiliary_loss_mlp": 0.01031054, "balance_loss_clip": 1.0404768, "balance_loss_mlp": 1.02385962, "epoch": 0.856490110022245, "flos": 39167768211840.0, "grad_norm": 1.5894299410531016, "language_loss": 0.69870245, "learning_rate": 2.1204898631870716e-07, "loss": 0.72015786, "num_input_tokens_seen": 154148315, "step": 7123, "time_per_iteration": 2.746953248977661 }, { "auxiliary_loss_clip": 0.01139052, "auxiliary_loss_mlp": 0.01021725, "balance_loss_clip": 1.04929316, "balance_loss_mlp": 1.01527548, "epoch": 0.856610352912884, "flos": 29059345658880.0, "grad_norm": 1.928569654537518, "language_loss": 0.75881845, "learning_rate": 2.1170005242004006e-07, "loss": 0.78042614, "num_input_tokens_seen": 154169665, "step": 7124, "time_per_iteration": 2.594882011413574 }, { "auxiliary_loss_clip": 0.01135234, "auxiliary_loss_mlp": 0.01025945, "balance_loss_clip": 1.04352331, "balance_loss_mlp": 1.01943922, "epoch": 0.8567305958035231, "flos": 23878333883520.0, "grad_norm": 3.3863640845531027, "language_loss": 0.7778604, "learning_rate": 2.1135138980505384e-07, "loss": 0.79947221, "num_input_tokens_seen": 154190335, "step": 7125, "time_per_iteration": 2.573951482772827 }, { "auxiliary_loss_clip": 0.01131535, "auxiliary_loss_mlp": 0.010251, "balance_loss_clip": 1.04565728, "balance_loss_mlp": 1.01852882, "epoch": 0.8568508386941622, "flos": 22200120599040.0, "grad_norm": 1.8436497252679986, "language_loss": 0.71927702, "learning_rate": 2.110029985266395e-07, "loss": 0.74084342, "num_input_tokens_seen": 154210040, "step": 7126, "time_per_iteration": 2.5380666255950928 }, { "auxiliary_loss_clip": 0.01134813, "auxiliary_loss_mlp": 0.01027905, "balance_loss_clip": 1.04209256, "balance_loss_mlp": 1.02104497, "epoch": 0.8569710815848013, "flos": 17307507121920.0, "grad_norm": 1.5628647635365924, "language_loss": 0.73796213, "learning_rate": 2.1065487863764787e-07, "loss": 0.75958925, "num_input_tokens_seen": 154228385, "step": 7127, "time_per_iteration": 2.5043489933013916 }, { "auxiliary_loss_clip": 0.01096582, "auxiliary_loss_mlp": 0.01025983, "balance_loss_clip": 1.03591514, "balance_loss_mlp": 1.01878858, "epoch": 0.8570913244754403, "flos": 23732285184000.0, "grad_norm": 1.5067117999831743, "language_loss": 0.85849237, "learning_rate": 2.1030703019088846e-07, "loss": 0.87971801, "num_input_tokens_seen": 154249015, "step": 7128, "time_per_iteration": 2.621246814727783 }, { "auxiliary_loss_clip": 0.01145217, "auxiliary_loss_mlp": 0.01023368, "balance_loss_clip": 1.04568398, "balance_loss_mlp": 1.01700521, "epoch": 0.8572115673660795, "flos": 20048748433920.0, "grad_norm": 1.9673981870137627, "language_loss": 0.70281518, "learning_rate": 2.099594532391291e-07, "loss": 0.72450101, "num_input_tokens_seen": 154267700, "step": 7129, "time_per_iteration": 3.2773783206939697 }, { "auxiliary_loss_clip": 0.01139667, "auxiliary_loss_mlp": 0.01026365, "balance_loss_clip": 1.04317117, "balance_loss_mlp": 1.0194869, "epoch": 0.8573318102567186, "flos": 27160389342720.0, "grad_norm": 1.5917602929765993, "language_loss": 0.78984076, "learning_rate": 2.0961214783509806e-07, "loss": 0.81150109, "num_input_tokens_seen": 154290580, "step": 7130, "time_per_iteration": 3.3909220695495605 }, { "auxiliary_loss_clip": 0.01138973, "auxiliary_loss_mlp": 0.01025985, "balance_loss_clip": 1.04312313, "balance_loss_mlp": 1.01932716, "epoch": 0.8574520531473576, "flos": 24936585402240.0, "grad_norm": 1.7295013111161173, "language_loss": 0.74488258, "learning_rate": 2.0926511403148051e-07, "loss": 0.76653218, "num_input_tokens_seen": 154309545, "step": 7131, "time_per_iteration": 2.5754666328430176 }, { "auxiliary_loss_clip": 0.01126065, "auxiliary_loss_mlp": 0.01024752, "balance_loss_clip": 1.04527652, "balance_loss_mlp": 1.01785576, "epoch": 0.8575722960379968, "flos": 18771154513920.0, "grad_norm": 2.366629418189321, "language_loss": 0.75903332, "learning_rate": 2.0891835188092143e-07, "loss": 0.78054148, "num_input_tokens_seen": 154326545, "step": 7132, "time_per_iteration": 3.2505407333374023 }, { "auxiliary_loss_clip": 0.01122528, "auxiliary_loss_mlp": 0.01030818, "balance_loss_clip": 1.04231679, "balance_loss_mlp": 1.02390361, "epoch": 0.8576925389286358, "flos": 22200300167040.0, "grad_norm": 1.8328221872097745, "language_loss": 0.81166589, "learning_rate": 2.0857186143602434e-07, "loss": 0.83319938, "num_input_tokens_seen": 154345190, "step": 7133, "time_per_iteration": 2.5705645084381104 }, { "auxiliary_loss_clip": 0.01112901, "auxiliary_loss_mlp": 0.01028832, "balance_loss_clip": 1.04051077, "balance_loss_mlp": 1.02184939, "epoch": 0.8578127818192749, "flos": 22894345733760.0, "grad_norm": 1.7158710048723405, "language_loss": 0.67678046, "learning_rate": 2.0822564274935094e-07, "loss": 0.69819784, "num_input_tokens_seen": 154364615, "step": 7134, "time_per_iteration": 2.614271402359009 }, { "auxiliary_loss_clip": 0.01133974, "auxiliary_loss_mlp": 0.01025745, "balance_loss_clip": 1.04660535, "balance_loss_mlp": 1.01848531, "epoch": 0.8579330247099141, "flos": 34824839541120.0, "grad_norm": 1.8615872723854747, "language_loss": 0.6697405, "learning_rate": 2.078796958734239e-07, "loss": 0.69133765, "num_input_tokens_seen": 154387335, "step": 7135, "time_per_iteration": 2.666271209716797 }, { "auxiliary_loss_clip": 0.01149946, "auxiliary_loss_mlp": 0.01023013, "balance_loss_clip": 1.04750943, "balance_loss_mlp": 1.01553857, "epoch": 0.8580532676005531, "flos": 19755681367680.0, "grad_norm": 2.406507315281437, "language_loss": 0.74623871, "learning_rate": 2.0753402086072124e-07, "loss": 0.7679683, "num_input_tokens_seen": 154405965, "step": 7136, "time_per_iteration": 2.499075412750244 }, { "auxiliary_loss_clip": 0.01088025, "auxiliary_loss_mlp": 0.01024756, "balance_loss_clip": 1.04157615, "balance_loss_mlp": 1.01823533, "epoch": 0.8581735104911922, "flos": 22739318634240.0, "grad_norm": 2.077459825572958, "language_loss": 0.75259149, "learning_rate": 2.071886177636828e-07, "loss": 0.77371931, "num_input_tokens_seen": 154422750, "step": 7137, "time_per_iteration": 2.671675682067871 }, { "auxiliary_loss_clip": 0.01148004, "auxiliary_loss_mlp": 0.01025204, "balance_loss_clip": 1.04689789, "balance_loss_mlp": 1.01775932, "epoch": 0.8582937533818313, "flos": 23149131880320.0, "grad_norm": 1.7337138732927413, "language_loss": 0.83015734, "learning_rate": 2.0684348663470575e-07, "loss": 0.85188937, "num_input_tokens_seen": 154442930, "step": 7138, "time_per_iteration": 2.5330188274383545 }, { "auxiliary_loss_clip": 0.01133406, "auxiliary_loss_mlp": 0.01022961, "balance_loss_clip": 1.04053998, "balance_loss_mlp": 1.01565957, "epoch": 0.8584139962724704, "flos": 19498668577920.0, "grad_norm": 1.8442432114910252, "language_loss": 0.617109, "learning_rate": 2.0649862752614555e-07, "loss": 0.63867271, "num_input_tokens_seen": 154461640, "step": 7139, "time_per_iteration": 2.532924175262451 }, { "auxiliary_loss_clip": 0.01036272, "auxiliary_loss_mlp": 0.01000487, "balance_loss_clip": 1.00809002, "balance_loss_mlp": 0.99937803, "epoch": 0.8585342391631094, "flos": 71276577788160.0, "grad_norm": 0.7534611323741901, "language_loss": 0.57017314, "learning_rate": 2.0615404049031838e-07, "loss": 0.59054077, "num_input_tokens_seen": 154518610, "step": 7140, "time_per_iteration": 3.1308369636535645 }, { "auxiliary_loss_clip": 0.01149417, "auxiliary_loss_mlp": 0.01030282, "balance_loss_clip": 1.04635549, "balance_loss_mlp": 1.02277768, "epoch": 0.8586544820537486, "flos": 10815432929280.0, "grad_norm": 3.7116436035198097, "language_loss": 0.78028667, "learning_rate": 2.0580972557949616e-07, "loss": 0.80208361, "num_input_tokens_seen": 154533700, "step": 7141, "time_per_iteration": 2.475665330886841 }, { "auxiliary_loss_clip": 0.01046147, "auxiliary_loss_mlp": 0.01002556, "balance_loss_clip": 1.00756681, "balance_loss_mlp": 1.0013814, "epoch": 0.8587747249443877, "flos": 64811184422400.0, "grad_norm": 0.7944029044256636, "language_loss": 0.54260665, "learning_rate": 2.054656828459125e-07, "loss": 0.56309366, "num_input_tokens_seen": 154597810, "step": 7142, "time_per_iteration": 3.133776903152466 }, { "auxiliary_loss_clip": 0.01105441, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.04197574, "balance_loss_mlp": 1.02430773, "epoch": 0.8588949678350267, "flos": 26834607964800.0, "grad_norm": 1.7529046913003, "language_loss": 0.77421844, "learning_rate": 2.051219123417578e-07, "loss": 0.79558587, "num_input_tokens_seen": 154617870, "step": 7143, "time_per_iteration": 2.662224292755127 }, { "auxiliary_loss_clip": 0.01163305, "auxiliary_loss_mlp": 0.0102702, "balance_loss_clip": 1.04620028, "balance_loss_mlp": 1.01994205, "epoch": 0.8590152107256659, "flos": 26104256726400.0, "grad_norm": 2.11316025719277, "language_loss": 0.60273933, "learning_rate": 2.0477841411918196e-07, "loss": 0.62464261, "num_input_tokens_seen": 154637395, "step": 7144, "time_per_iteration": 2.5115489959716797 }, { "auxiliary_loss_clip": 0.01143798, "auxiliary_loss_mlp": 0.01023313, "balance_loss_clip": 1.04434276, "balance_loss_mlp": 1.01639009, "epoch": 0.859135453616305, "flos": 26140885620480.0, "grad_norm": 2.1324422584551685, "language_loss": 0.7459327, "learning_rate": 2.0443518823029326e-07, "loss": 0.76760381, "num_input_tokens_seen": 154657935, "step": 7145, "time_per_iteration": 3.3944649696350098 }, { "auxiliary_loss_clip": 0.01115936, "auxiliary_loss_mlp": 0.01025259, "balance_loss_clip": 1.04212904, "balance_loss_mlp": 1.01859534, "epoch": 0.859255696506944, "flos": 12969319046400.0, "grad_norm": 1.970716338473445, "language_loss": 0.76516694, "learning_rate": 2.0409223472715854e-07, "loss": 0.78657889, "num_input_tokens_seen": 154675080, "step": 7146, "time_per_iteration": 2.541827440261841 }, { "auxiliary_loss_clip": 0.01120478, "auxiliary_loss_mlp": 0.00760422, "balance_loss_clip": 1.04346418, "balance_loss_mlp": 1.00031924, "epoch": 0.8593759393975832, "flos": 18475753063680.0, "grad_norm": 1.8965006049407658, "language_loss": 0.74906409, "learning_rate": 2.0374955366180434e-07, "loss": 0.76787305, "num_input_tokens_seen": 154692720, "step": 7147, "time_per_iteration": 2.5774054527282715 }, { "auxiliary_loss_clip": 0.01124535, "auxiliary_loss_mlp": 0.01023958, "balance_loss_clip": 1.04115152, "balance_loss_mlp": 1.01673687, "epoch": 0.8594961822882222, "flos": 22200156512640.0, "grad_norm": 1.6816025136425539, "language_loss": 0.72729921, "learning_rate": 2.034071450862147e-07, "loss": 0.74878418, "num_input_tokens_seen": 154710190, "step": 7148, "time_per_iteration": 2.5772314071655273 }, { "auxiliary_loss_clip": 0.01137763, "auxiliary_loss_mlp": 0.01031603, "balance_loss_clip": 1.04227948, "balance_loss_mlp": 1.02411664, "epoch": 0.8596164251788613, "flos": 23294749616640.0, "grad_norm": 1.6261222756730747, "language_loss": 0.76445735, "learning_rate": 2.030650090523327e-07, "loss": 0.78615105, "num_input_tokens_seen": 154729380, "step": 7149, "time_per_iteration": 2.559288501739502 }, { "auxiliary_loss_clip": 0.01117664, "auxiliary_loss_mlp": 0.01025041, "balance_loss_clip": 1.04130793, "balance_loss_mlp": 1.01757252, "epoch": 0.8597366680695004, "flos": 31649905416960.0, "grad_norm": 1.6305425960348283, "language_loss": 0.59612447, "learning_rate": 2.0272314561205995e-07, "loss": 0.61755157, "num_input_tokens_seen": 154749775, "step": 7150, "time_per_iteration": 2.6614830493927 }, { "auxiliary_loss_clip": 0.01114892, "auxiliary_loss_mlp": 0.01027519, "balance_loss_clip": 1.04010403, "balance_loss_mlp": 1.0208348, "epoch": 0.8598569109601395, "flos": 21287738211840.0, "grad_norm": 1.7353291012423602, "language_loss": 0.72873402, "learning_rate": 2.023815548172567e-07, "loss": 0.75015807, "num_input_tokens_seen": 154769845, "step": 7151, "time_per_iteration": 2.6153972148895264 }, { "auxiliary_loss_clip": 0.01147916, "auxiliary_loss_mlp": 0.01029167, "balance_loss_clip": 1.04471421, "balance_loss_mlp": 1.02218127, "epoch": 0.8599771538507786, "flos": 25447809720960.0, "grad_norm": 1.6880678954411976, "language_loss": 0.65823734, "learning_rate": 2.0204023671974267e-07, "loss": 0.68000817, "num_input_tokens_seen": 154789230, "step": 7152, "time_per_iteration": 2.5435783863067627 }, { "auxiliary_loss_clip": 0.01144195, "auxiliary_loss_mlp": 0.01025976, "balance_loss_clip": 1.04437792, "balance_loss_mlp": 1.01927686, "epoch": 0.8600973967414177, "flos": 16723958768640.0, "grad_norm": 2.0465524089375666, "language_loss": 0.80985057, "learning_rate": 2.0169919137129532e-07, "loss": 0.83155227, "num_input_tokens_seen": 154807670, "step": 7153, "time_per_iteration": 2.4748005867004395 }, { "auxiliary_loss_clip": 0.01153599, "auxiliary_loss_mlp": 0.01028036, "balance_loss_clip": 1.04918921, "balance_loss_mlp": 1.02031088, "epoch": 0.8602176396320568, "flos": 25227928615680.0, "grad_norm": 2.057025864587331, "language_loss": 0.70629394, "learning_rate": 2.013584188236508e-07, "loss": 0.72811031, "num_input_tokens_seen": 154825575, "step": 7154, "time_per_iteration": 2.539083957672119 }, { "auxiliary_loss_clip": 0.01165331, "auxiliary_loss_mlp": 0.01025339, "balance_loss_clip": 1.04790735, "balance_loss_mlp": 1.01880956, "epoch": 0.8603378825226958, "flos": 20412236113920.0, "grad_norm": 1.645221947012553, "language_loss": 0.79536462, "learning_rate": 2.0101791912850396e-07, "loss": 0.81727135, "num_input_tokens_seen": 154845115, "step": 7155, "time_per_iteration": 3.2552435398101807 }, { "auxiliary_loss_clip": 0.01137741, "auxiliary_loss_mlp": 0.01023919, "balance_loss_clip": 1.04589856, "balance_loss_mlp": 1.01667988, "epoch": 0.8604581254133349, "flos": 34930201109760.0, "grad_norm": 1.8505876380047412, "language_loss": 0.63898826, "learning_rate": 2.006776923375082e-07, "loss": 0.66060489, "num_input_tokens_seen": 154866770, "step": 7156, "time_per_iteration": 3.4595329761505127 }, { "auxiliary_loss_clip": 0.01161807, "auxiliary_loss_mlp": 0.01023509, "balance_loss_clip": 1.04643798, "balance_loss_mlp": 1.01657653, "epoch": 0.860578368303974, "flos": 22596538072320.0, "grad_norm": 1.7182304424495394, "language_loss": 0.71041155, "learning_rate": 2.003377385022764e-07, "loss": 0.73226476, "num_input_tokens_seen": 154885595, "step": 7157, "time_per_iteration": 2.4753329753875732 }, { "auxiliary_loss_clip": 0.01138003, "auxiliary_loss_mlp": 0.01027736, "balance_loss_clip": 1.04505718, "balance_loss_mlp": 1.02117944, "epoch": 0.8606986111946131, "flos": 21324331192320.0, "grad_norm": 2.189964845654228, "language_loss": 0.77302706, "learning_rate": 1.9999805767437826e-07, "loss": 0.79468441, "num_input_tokens_seen": 154904485, "step": 7158, "time_per_iteration": 3.230210304260254 }, { "auxiliary_loss_clip": 0.01126757, "auxiliary_loss_mlp": 0.01023766, "balance_loss_clip": 1.0406692, "balance_loss_mlp": 1.01650059, "epoch": 0.8608188540852522, "flos": 28877206769280.0, "grad_norm": 1.718491510576672, "language_loss": 0.71505129, "learning_rate": 1.9965864990534386e-07, "loss": 0.73655653, "num_input_tokens_seen": 154925010, "step": 7159, "time_per_iteration": 2.568971633911133 }, { "auxiliary_loss_clip": 0.01113702, "auxiliary_loss_mlp": 0.01025179, "balance_loss_clip": 1.03934407, "balance_loss_mlp": 1.01833916, "epoch": 0.8609390969758913, "flos": 29716187713920.0, "grad_norm": 1.8088948711517434, "language_loss": 0.77564454, "learning_rate": 1.9931951524666092e-07, "loss": 0.79703337, "num_input_tokens_seen": 154946100, "step": 7160, "time_per_iteration": 2.66259765625 }, { "auxiliary_loss_clip": 0.01151101, "auxiliary_loss_mlp": 0.00760674, "balance_loss_clip": 1.0458535, "balance_loss_mlp": 1.00031376, "epoch": 0.8610593398665304, "flos": 21249349551360.0, "grad_norm": 1.9462646814776654, "language_loss": 0.81033158, "learning_rate": 1.9898065374977534e-07, "loss": 0.82944936, "num_input_tokens_seen": 154966305, "step": 7161, "time_per_iteration": 2.5024962425231934 }, { "auxiliary_loss_clip": 0.01114793, "auxiliary_loss_mlp": 0.01027037, "balance_loss_clip": 1.04261041, "balance_loss_mlp": 1.02126765, "epoch": 0.8611795827571694, "flos": 14830102183680.0, "grad_norm": 1.9129355287467098, "language_loss": 0.73095334, "learning_rate": 1.9864206546609342e-07, "loss": 0.75237167, "num_input_tokens_seen": 154985145, "step": 7162, "time_per_iteration": 2.5485823154449463 }, { "auxiliary_loss_clip": 0.01160705, "auxiliary_loss_mlp": 0.01024433, "balance_loss_clip": 1.04544783, "balance_loss_mlp": 1.01780248, "epoch": 0.8612998256478086, "flos": 24243258107520.0, "grad_norm": 1.7484356839992945, "language_loss": 0.84227335, "learning_rate": 1.983037504469771e-07, "loss": 0.86412477, "num_input_tokens_seen": 155003855, "step": 7163, "time_per_iteration": 2.485396385192871 }, { "auxiliary_loss_clip": 0.0114997, "auxiliary_loss_mlp": 0.01031015, "balance_loss_clip": 1.04729593, "balance_loss_mlp": 1.02391887, "epoch": 0.8614200685384477, "flos": 21252653602560.0, "grad_norm": 1.624706848745363, "language_loss": 0.6614362, "learning_rate": 1.9796570874374984e-07, "loss": 0.68324602, "num_input_tokens_seen": 155023960, "step": 7164, "time_per_iteration": 2.5103588104248047 }, { "auxiliary_loss_clip": 0.01139432, "auxiliary_loss_mlp": 0.01024299, "balance_loss_clip": 1.04523063, "balance_loss_mlp": 1.01716483, "epoch": 0.8615403114290867, "flos": 20007738080640.0, "grad_norm": 1.6307874990096964, "language_loss": 0.77630866, "learning_rate": 1.976279404076917e-07, "loss": 0.79794598, "num_input_tokens_seen": 155043360, "step": 7165, "time_per_iteration": 2.554715633392334 }, { "auxiliary_loss_clip": 0.01124318, "auxiliary_loss_mlp": 0.01025132, "balance_loss_clip": 1.0472374, "balance_loss_mlp": 1.0178659, "epoch": 0.8616605543197259, "flos": 29789373674880.0, "grad_norm": 2.357895460210842, "language_loss": 0.76363385, "learning_rate": 1.9729044549004193e-07, "loss": 0.78512836, "num_input_tokens_seen": 155064745, "step": 7166, "time_per_iteration": 2.6374943256378174 }, { "auxiliary_loss_clip": 0.01148962, "auxiliary_loss_mlp": 0.01023188, "balance_loss_clip": 1.04756629, "balance_loss_mlp": 1.01684594, "epoch": 0.8617807972103649, "flos": 28911609020160.0, "grad_norm": 1.6244525444004287, "language_loss": 0.70402241, "learning_rate": 1.9695322404199822e-07, "loss": 0.72574383, "num_input_tokens_seen": 155086790, "step": 7167, "time_per_iteration": 2.563080310821533 }, { "auxiliary_loss_clip": 0.01132401, "auxiliary_loss_mlp": 0.01027166, "balance_loss_clip": 1.04343009, "balance_loss_mlp": 1.02015662, "epoch": 0.861901040101004, "flos": 27673804391040.0, "grad_norm": 1.7686423999246503, "language_loss": 0.82133794, "learning_rate": 1.9661627611471654e-07, "loss": 0.84293365, "num_input_tokens_seen": 155106585, "step": 7168, "time_per_iteration": 2.589808940887451 }, { "auxiliary_loss_clip": 0.01136262, "auxiliary_loss_mlp": 0.01027777, "balance_loss_clip": 1.04253578, "balance_loss_mlp": 1.02052867, "epoch": 0.8620212829916432, "flos": 49748056755840.0, "grad_norm": 1.7436370851333989, "language_loss": 0.70214409, "learning_rate": 1.9627960175931246e-07, "loss": 0.72378445, "num_input_tokens_seen": 155131285, "step": 7169, "time_per_iteration": 2.778090238571167 }, { "auxiliary_loss_clip": 0.01149953, "auxiliary_loss_mlp": 0.01032786, "balance_loss_clip": 1.04799402, "balance_loss_mlp": 1.02640569, "epoch": 0.8621415258822822, "flos": 21138672769920.0, "grad_norm": 1.7519167904296022, "language_loss": 0.74137694, "learning_rate": 1.9594320102685847e-07, "loss": 0.76320434, "num_input_tokens_seen": 155150555, "step": 7170, "time_per_iteration": 2.523455858230591 }, { "auxiliary_loss_clip": 0.01126196, "auxiliary_loss_mlp": 0.00760415, "balance_loss_clip": 1.04237628, "balance_loss_mlp": 1.00030947, "epoch": 0.8622617687729213, "flos": 21689039934720.0, "grad_norm": 1.9912862530119098, "language_loss": 0.63854975, "learning_rate": 1.956070739683864e-07, "loss": 0.65741587, "num_input_tokens_seen": 155169890, "step": 7171, "time_per_iteration": 3.30953311920166 }, { "auxiliary_loss_clip": 0.01102564, "auxiliary_loss_mlp": 0.01025597, "balance_loss_clip": 1.03730941, "balance_loss_mlp": 1.01909995, "epoch": 0.8623820116635604, "flos": 26250592734720.0, "grad_norm": 1.450767929302588, "language_loss": 0.74171996, "learning_rate": 1.9527122063488678e-07, "loss": 0.76300162, "num_input_tokens_seen": 155191005, "step": 7172, "time_per_iteration": 2.596179723739624 }, { "auxiliary_loss_clip": 0.0113314, "auxiliary_loss_mlp": 0.01030063, "balance_loss_clip": 1.04101849, "balance_loss_mlp": 1.02302372, "epoch": 0.8625022545541995, "flos": 19647554451840.0, "grad_norm": 1.5313159819419662, "language_loss": 0.80136704, "learning_rate": 1.9493564107730755e-07, "loss": 0.822999, "num_input_tokens_seen": 155211005, "step": 7173, "time_per_iteration": 2.554286241531372 }, { "auxiliary_loss_clip": 0.01128331, "auxiliary_loss_mlp": 0.01029172, "balance_loss_clip": 1.04055667, "balance_loss_mlp": 1.02266896, "epoch": 0.8626224974448385, "flos": 21908382336000.0, "grad_norm": 1.8678988506650676, "language_loss": 0.60632885, "learning_rate": 1.9460033534655684e-07, "loss": 0.62790382, "num_input_tokens_seen": 155230365, "step": 7174, "time_per_iteration": 2.536045789718628 }, { "auxiliary_loss_clip": 0.01128965, "auxiliary_loss_mlp": 0.01022836, "balance_loss_clip": 1.03986394, "balance_loss_mlp": 1.0160799, "epoch": 0.8627427403354777, "flos": 23331198942720.0, "grad_norm": 1.5551753517803542, "language_loss": 0.84248042, "learning_rate": 1.9426530349349978e-07, "loss": 0.86399841, "num_input_tokens_seen": 155250815, "step": 7175, "time_per_iteration": 2.5564353466033936 }, { "auxiliary_loss_clip": 0.01147658, "auxiliary_loss_mlp": 0.00760563, "balance_loss_clip": 1.04448867, "balance_loss_mlp": 1.00027418, "epoch": 0.8628629832261168, "flos": 16362877299840.0, "grad_norm": 1.7960932922775064, "language_loss": 0.64517534, "learning_rate": 1.9393054556896038e-07, "loss": 0.66425753, "num_input_tokens_seen": 155268515, "step": 7176, "time_per_iteration": 2.4777421951293945 }, { "auxiliary_loss_clip": 0.01111922, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 1.03892016, "balance_loss_mlp": 1.01940227, "epoch": 0.8629832261167558, "flos": 28103941756800.0, "grad_norm": 2.4597074888009947, "language_loss": 0.69052738, "learning_rate": 1.9359606162372133e-07, "loss": 0.7119137, "num_input_tokens_seen": 155290120, "step": 7177, "time_per_iteration": 2.6304659843444824 }, { "auxiliary_loss_clip": 0.011631, "auxiliary_loss_mlp": 0.01024432, "balance_loss_clip": 1.04791057, "balance_loss_mlp": 1.01792622, "epoch": 0.863103469007395, "flos": 20230061310720.0, "grad_norm": 1.6427269285059034, "language_loss": 0.70653886, "learning_rate": 1.9326185170852293e-07, "loss": 0.72841418, "num_input_tokens_seen": 155309085, "step": 7178, "time_per_iteration": 2.450962543487549 }, { "auxiliary_loss_clip": 0.01147327, "auxiliary_loss_mlp": 0.01024502, "balance_loss_clip": 1.04465795, "balance_loss_mlp": 1.0177424, "epoch": 0.863223711898034, "flos": 24498547044480.0, "grad_norm": 1.7354827706606928, "language_loss": 0.71969587, "learning_rate": 1.9292791587406598e-07, "loss": 0.74141419, "num_input_tokens_seen": 155327945, "step": 7179, "time_per_iteration": 2.542449712753296 }, { "auxiliary_loss_clip": 0.01146913, "auxiliary_loss_mlp": 0.00760853, "balance_loss_clip": 1.04395747, "balance_loss_mlp": 1.00032067, "epoch": 0.8633439547886731, "flos": 17675376261120.0, "grad_norm": 2.174441748891654, "language_loss": 0.86911088, "learning_rate": 1.9259425417100661e-07, "loss": 0.8881886, "num_input_tokens_seen": 155344060, "step": 7180, "time_per_iteration": 2.465039014816284 }, { "auxiliary_loss_clip": 0.01087324, "auxiliary_loss_mlp": 0.01023671, "balance_loss_clip": 1.03266001, "balance_loss_mlp": 1.01615477, "epoch": 0.8634641976793123, "flos": 12895055677440.0, "grad_norm": 2.065646443381123, "language_loss": 0.74722522, "learning_rate": 1.9226086664996234e-07, "loss": 0.76833522, "num_input_tokens_seen": 155362305, "step": 7181, "time_per_iteration": 3.3828134536743164 }, { "auxiliary_loss_clip": 0.01135143, "auxiliary_loss_mlp": 0.01028439, "balance_loss_clip": 1.04637051, "balance_loss_mlp": 1.02157855, "epoch": 0.8635844405699513, "flos": 23878980328320.0, "grad_norm": 1.8530909394369006, "language_loss": 0.74166, "learning_rate": 1.9192775336150712e-07, "loss": 0.76329589, "num_input_tokens_seen": 155382605, "step": 7182, "time_per_iteration": 3.3846585750579834 }, { "auxiliary_loss_clip": 0.01044247, "auxiliary_loss_mlp": 0.01001704, "balance_loss_clip": 1.00890756, "balance_loss_mlp": 1.00060701, "epoch": 0.8637046834605904, "flos": 60453387521280.0, "grad_norm": 0.7625399855014642, "language_loss": 0.56324816, "learning_rate": 1.915949143561739e-07, "loss": 0.58370757, "num_input_tokens_seen": 155437280, "step": 7183, "time_per_iteration": 3.0729289054870605 }, { "auxiliary_loss_clip": 0.01150624, "auxiliary_loss_mlp": 0.01028061, "balance_loss_clip": 1.04870343, "balance_loss_mlp": 1.02099204, "epoch": 0.8638249263512295, "flos": 20558751690240.0, "grad_norm": 1.6264489840898542, "language_loss": 0.77925324, "learning_rate": 1.9126234968445498e-07, "loss": 0.80104005, "num_input_tokens_seen": 155456970, "step": 7184, "time_per_iteration": 3.2266340255737305 }, { "auxiliary_loss_clip": 0.01163852, "auxiliary_loss_mlp": 0.01026734, "balance_loss_clip": 1.0477159, "balance_loss_mlp": 1.01962566, "epoch": 0.8639451692418686, "flos": 26615768353920.0, "grad_norm": 1.4169971357044704, "language_loss": 0.67579746, "learning_rate": 1.9093005939679884e-07, "loss": 0.6977033, "num_input_tokens_seen": 155478925, "step": 7185, "time_per_iteration": 2.521273136138916 }, { "auxiliary_loss_clip": 0.01149342, "auxiliary_loss_mlp": 0.01029892, "balance_loss_clip": 1.04720998, "balance_loss_mlp": 1.0226264, "epoch": 0.8640654121325076, "flos": 15122450977920.0, "grad_norm": 1.8571380024509778, "language_loss": 0.76450729, "learning_rate": 1.9059804354361452e-07, "loss": 0.78629959, "num_input_tokens_seen": 155496700, "step": 7186, "time_per_iteration": 2.479379177093506 }, { "auxiliary_loss_clip": 0.01127155, "auxiliary_loss_mlp": 0.01024781, "balance_loss_clip": 1.04011893, "balance_loss_mlp": 1.01746726, "epoch": 0.8641856550231467, "flos": 31869068250240.0, "grad_norm": 1.4604738585374826, "language_loss": 0.69936788, "learning_rate": 1.902663021752684e-07, "loss": 0.72088724, "num_input_tokens_seen": 155518130, "step": 7187, "time_per_iteration": 2.6127097606658936 }, { "auxiliary_loss_clip": 0.01166611, "auxiliary_loss_mlp": 0.01029213, "balance_loss_clip": 1.04968369, "balance_loss_mlp": 1.02223361, "epoch": 0.8643058979137859, "flos": 14976545932800.0, "grad_norm": 2.4796348886020487, "language_loss": 0.8204428, "learning_rate": 1.8993483534208556e-07, "loss": 0.84240103, "num_input_tokens_seen": 155537040, "step": 7188, "time_per_iteration": 2.454071283340454 }, { "auxiliary_loss_clip": 0.01126795, "auxiliary_loss_mlp": 0.01026473, "balance_loss_clip": 1.04307425, "balance_loss_mlp": 1.01888561, "epoch": 0.8644261408044249, "flos": 13115726881920.0, "grad_norm": 2.390853335229563, "language_loss": 0.74767578, "learning_rate": 1.8960364309434884e-07, "loss": 0.76920849, "num_input_tokens_seen": 155554535, "step": 7189, "time_per_iteration": 2.4886999130249023 }, { "auxiliary_loss_clip": 0.01088778, "auxiliary_loss_mlp": 0.00760531, "balance_loss_clip": 1.03890038, "balance_loss_mlp": 1.00032711, "epoch": 0.864546383695064, "flos": 20850920916480.0, "grad_norm": 1.8257697527748729, "language_loss": 0.77975142, "learning_rate": 1.8927272548229967e-07, "loss": 0.79824448, "num_input_tokens_seen": 155574225, "step": 7190, "time_per_iteration": 2.6455118656158447 }, { "auxiliary_loss_clip": 0.01107029, "auxiliary_loss_mlp": 0.0102969, "balance_loss_clip": 1.04161167, "balance_loss_mlp": 1.02276349, "epoch": 0.8646666265857031, "flos": 21324582587520.0, "grad_norm": 1.5106593763284248, "language_loss": 0.82770705, "learning_rate": 1.8894208255613876e-07, "loss": 0.84907424, "num_input_tokens_seen": 155593540, "step": 7191, "time_per_iteration": 2.5900216102600098 }, { "auxiliary_loss_clip": 0.01162469, "auxiliary_loss_mlp": 0.01022367, "balance_loss_clip": 1.04790473, "balance_loss_mlp": 1.01545596, "epoch": 0.8647868694763422, "flos": 19750833031680.0, "grad_norm": 2.107367405147276, "language_loss": 0.77647632, "learning_rate": 1.8861171436602397e-07, "loss": 0.7983247, "num_input_tokens_seen": 155610655, "step": 7192, "time_per_iteration": 2.4750211238861084 }, { "auxiliary_loss_clip": 0.01153324, "auxiliary_loss_mlp": 0.01026343, "balance_loss_clip": 1.04839897, "balance_loss_mlp": 1.01892233, "epoch": 0.8649071123669813, "flos": 26176760328960.0, "grad_norm": 2.0752615694797343, "language_loss": 0.80177224, "learning_rate": 1.882816209620719e-07, "loss": 0.82356888, "num_input_tokens_seen": 155627365, "step": 7193, "time_per_iteration": 2.5357677936553955 }, { "auxiliary_loss_clip": 0.01136724, "auxiliary_loss_mlp": 0.01029723, "balance_loss_clip": 1.04676938, "balance_loss_mlp": 1.02255547, "epoch": 0.8650273552576204, "flos": 20302888135680.0, "grad_norm": 2.3821477184722033, "language_loss": 0.76528919, "learning_rate": 1.8795180239435738e-07, "loss": 0.78695363, "num_input_tokens_seen": 155646220, "step": 7194, "time_per_iteration": 2.5449724197387695 }, { "auxiliary_loss_clip": 0.01140298, "auxiliary_loss_mlp": 0.01026128, "balance_loss_clip": 1.04501629, "balance_loss_mlp": 1.01888013, "epoch": 0.8651475981482595, "flos": 23951088881280.0, "grad_norm": 2.676935995629338, "language_loss": 0.75432432, "learning_rate": 1.8762225871291348e-07, "loss": 0.77598858, "num_input_tokens_seen": 155662095, "step": 7195, "time_per_iteration": 2.545774221420288 }, { "auxiliary_loss_clip": 0.01162862, "auxiliary_loss_mlp": 0.00760194, "balance_loss_clip": 1.04724598, "balance_loss_mlp": 1.00032914, "epoch": 0.8652678410388985, "flos": 21684622561920.0, "grad_norm": 1.5858781981162235, "language_loss": 0.8098979, "learning_rate": 1.8729298996773201e-07, "loss": 0.82912844, "num_input_tokens_seen": 155680845, "step": 7196, "time_per_iteration": 2.482956647872925 }, { "auxiliary_loss_clip": 0.01041735, "auxiliary_loss_mlp": 0.01002018, "balance_loss_clip": 1.00716245, "balance_loss_mlp": 1.00096309, "epoch": 0.8653880839295377, "flos": 65224660855680.0, "grad_norm": 0.8299362979036086, "language_loss": 0.60961998, "learning_rate": 1.8696399620876301e-07, "loss": 0.63005751, "num_input_tokens_seen": 155737875, "step": 7197, "time_per_iteration": 3.767765998840332 }, { "auxiliary_loss_clip": 0.01114697, "auxiliary_loss_mlp": 0.01033465, "balance_loss_clip": 1.0368278, "balance_loss_mlp": 1.02621162, "epoch": 0.8655083268201768, "flos": 17749172753280.0, "grad_norm": 2.08357895748252, "language_loss": 0.79087007, "learning_rate": 1.866352774859141e-07, "loss": 0.8123517, "num_input_tokens_seen": 155753100, "step": 7198, "time_per_iteration": 2.595792055130005 }, { "auxiliary_loss_clip": 0.01123441, "auxiliary_loss_mlp": 0.01028232, "balance_loss_clip": 1.0400691, "balance_loss_mlp": 1.02196491, "epoch": 0.8656285697108158, "flos": 20703974376960.0, "grad_norm": 2.496611850782563, "language_loss": 0.68925655, "learning_rate": 1.8630683384905188e-07, "loss": 0.71077329, "num_input_tokens_seen": 155772430, "step": 7199, "time_per_iteration": 2.5724236965179443 }, { "auxiliary_loss_clip": 0.01164583, "auxiliary_loss_mlp": 0.00760869, "balance_loss_clip": 1.04868424, "balance_loss_mlp": 1.00029278, "epoch": 0.865748812601455, "flos": 18653833716480.0, "grad_norm": 1.8847250079531883, "language_loss": 0.88662255, "learning_rate": 1.8597866534800045e-07, "loss": 0.90587711, "num_input_tokens_seen": 155787545, "step": 7200, "time_per_iteration": 2.444260358810425 }, { "auxiliary_loss_clip": 0.01151253, "auxiliary_loss_mlp": 0.00760658, "balance_loss_clip": 1.04677773, "balance_loss_mlp": 1.00032926, "epoch": 0.865869055492094, "flos": 70652554807680.0, "grad_norm": 2.5001653280417075, "language_loss": 0.74532473, "learning_rate": 1.8565077203254398e-07, "loss": 0.76444381, "num_input_tokens_seen": 155813005, "step": 7201, "time_per_iteration": 2.9021856784820557 }, { "auxiliary_loss_clip": 0.01123032, "auxiliary_loss_mlp": 0.0102458, "balance_loss_clip": 1.04627419, "balance_loss_mlp": 1.01723373, "epoch": 0.8659892983827331, "flos": 17383961220480.0, "grad_norm": 2.4513084309942155, "language_loss": 0.72994065, "learning_rate": 1.8532315395242203e-07, "loss": 0.75141674, "num_input_tokens_seen": 155829455, "step": 7202, "time_per_iteration": 2.5436387062072754 }, { "auxiliary_loss_clip": 0.01124275, "auxiliary_loss_mlp": 0.01023636, "balance_loss_clip": 1.04457521, "balance_loss_mlp": 1.01704943, "epoch": 0.8661095412733723, "flos": 17895221452800.0, "grad_norm": 2.0089935093888944, "language_loss": 0.7217328, "learning_rate": 1.849958111573353e-07, "loss": 0.74321198, "num_input_tokens_seen": 155848060, "step": 7203, "time_per_iteration": 2.558933734893799 }, { "auxiliary_loss_clip": 0.01160438, "auxiliary_loss_mlp": 0.01022006, "balance_loss_clip": 1.04565239, "balance_loss_mlp": 1.01527071, "epoch": 0.8662297841640113, "flos": 18224163227520.0, "grad_norm": 1.7138758142657744, "language_loss": 0.63733268, "learning_rate": 1.8466874369694074e-07, "loss": 0.65915704, "num_input_tokens_seen": 155865755, "step": 7204, "time_per_iteration": 2.434722900390625 }, { "auxiliary_loss_clip": 0.01120993, "auxiliary_loss_mlp": 0.01024519, "balance_loss_clip": 1.03907883, "balance_loss_mlp": 1.01790857, "epoch": 0.8663500270546504, "flos": 16362159027840.0, "grad_norm": 3.38495134390697, "language_loss": 0.70397162, "learning_rate": 1.843419516208542e-07, "loss": 0.72542667, "num_input_tokens_seen": 155882680, "step": 7205, "time_per_iteration": 2.530596971511841 }, { "auxiliary_loss_clip": 0.01150419, "auxiliary_loss_mlp": 0.01026302, "balance_loss_clip": 1.04728866, "balance_loss_mlp": 1.01916122, "epoch": 0.8664702699452895, "flos": 17894431353600.0, "grad_norm": 2.1846291023592093, "language_loss": 0.79875207, "learning_rate": 1.8401543497865047e-07, "loss": 0.82051933, "num_input_tokens_seen": 155900680, "step": 7206, "time_per_iteration": 2.460141658782959 }, { "auxiliary_loss_clip": 0.01152635, "auxiliary_loss_mlp": 0.00760449, "balance_loss_clip": 1.04596853, "balance_loss_mlp": 1.00027347, "epoch": 0.8665905128359286, "flos": 30736373794560.0, "grad_norm": 2.6006692552746062, "language_loss": 0.64386648, "learning_rate": 1.836891938198608e-07, "loss": 0.66299731, "num_input_tokens_seen": 155921105, "step": 7207, "time_per_iteration": 3.4057466983795166 }, { "auxiliary_loss_clip": 0.01137899, "auxiliary_loss_mlp": 0.01024011, "balance_loss_clip": 1.0468868, "balance_loss_mlp": 1.01726997, "epoch": 0.8667107557265676, "flos": 18656419495680.0, "grad_norm": 2.075565037425898, "language_loss": 0.71056598, "learning_rate": 1.8336322819397677e-07, "loss": 0.73218513, "num_input_tokens_seen": 155938640, "step": 7208, "time_per_iteration": 3.3505895137786865 }, { "auxiliary_loss_clip": 0.01125565, "auxiliary_loss_mlp": 0.0102353, "balance_loss_clip": 1.04092216, "balance_loss_mlp": 1.0164696, "epoch": 0.8668309986172068, "flos": 20083725302400.0, "grad_norm": 1.8136044401186209, "language_loss": 0.62710959, "learning_rate": 1.8303753815044654e-07, "loss": 0.64860058, "num_input_tokens_seen": 155957945, "step": 7209, "time_per_iteration": 2.5447394847869873 }, { "auxiliary_loss_clip": 0.01140455, "auxiliary_loss_mlp": 0.01032326, "balance_loss_clip": 1.04178154, "balance_loss_mlp": 1.02441347, "epoch": 0.8669512415078459, "flos": 21615099788160.0, "grad_norm": 2.1561740829742746, "language_loss": 0.70651734, "learning_rate": 1.827121237386773e-07, "loss": 0.72824514, "num_input_tokens_seen": 155975390, "step": 7210, "time_per_iteration": 3.283843994140625 }, { "auxiliary_loss_clip": 0.01138802, "auxiliary_loss_mlp": 0.01027471, "balance_loss_clip": 1.04406834, "balance_loss_mlp": 1.02012753, "epoch": 0.8670714843984849, "flos": 17703601372800.0, "grad_norm": 3.4408673816974544, "language_loss": 0.75140381, "learning_rate": 1.8238698500803374e-07, "loss": 0.77306652, "num_input_tokens_seen": 155988155, "step": 7211, "time_per_iteration": 2.497735023498535 }, { "auxiliary_loss_clip": 0.01046422, "auxiliary_loss_mlp": 0.01001141, "balance_loss_clip": 1.00775504, "balance_loss_mlp": 1.00004983, "epoch": 0.8671917272891241, "flos": 60705483125760.0, "grad_norm": 0.7159631124734989, "language_loss": 0.56328845, "learning_rate": 1.820621220078391e-07, "loss": 0.58376408, "num_input_tokens_seen": 156052065, "step": 7212, "time_per_iteration": 3.159484386444092 }, { "auxiliary_loss_clip": 0.01163022, "auxiliary_loss_mlp": 0.01024133, "balance_loss_clip": 1.0477432, "balance_loss_mlp": 1.01754701, "epoch": 0.8673119701797631, "flos": 20451881750400.0, "grad_norm": 2.1140888384569267, "language_loss": 0.67794698, "learning_rate": 1.8173753478737553e-07, "loss": 0.69981849, "num_input_tokens_seen": 156072500, "step": 7213, "time_per_iteration": 2.4833598136901855 }, { "auxiliary_loss_clip": 0.01162316, "auxiliary_loss_mlp": 0.01026756, "balance_loss_clip": 1.0463382, "balance_loss_mlp": 1.02006269, "epoch": 0.8674322130704022, "flos": 19647410797440.0, "grad_norm": 1.9247443315881014, "language_loss": 0.79533732, "learning_rate": 1.8141322339588205e-07, "loss": 0.81722808, "num_input_tokens_seen": 156089840, "step": 7214, "time_per_iteration": 2.4648218154907227 }, { "auxiliary_loss_clip": 0.01162547, "auxiliary_loss_mlp": 0.01027554, "balance_loss_clip": 1.04711163, "balance_loss_mlp": 1.02078879, "epoch": 0.8675524559610414, "flos": 26025001367040.0, "grad_norm": 2.1880669250889673, "language_loss": 0.70154804, "learning_rate": 1.810891878825569e-07, "loss": 0.72344899, "num_input_tokens_seen": 156109815, "step": 7215, "time_per_iteration": 2.4922640323638916 }, { "auxiliary_loss_clip": 0.01133835, "auxiliary_loss_mlp": 0.01024303, "balance_loss_clip": 1.04262519, "balance_loss_mlp": 1.01716864, "epoch": 0.8676726988516804, "flos": 15049444584960.0, "grad_norm": 1.806592876116107, "language_loss": 0.719926, "learning_rate": 1.8076542829655561e-07, "loss": 0.74150741, "num_input_tokens_seen": 156128620, "step": 7216, "time_per_iteration": 2.547414541244507 }, { "auxiliary_loss_clip": 0.01132588, "auxiliary_loss_mlp": 0.01026213, "balance_loss_clip": 1.04416513, "balance_loss_mlp": 1.01905465, "epoch": 0.8677929417423195, "flos": 16288111140480.0, "grad_norm": 1.9667870584154197, "language_loss": 0.79285955, "learning_rate": 1.8044194468699203e-07, "loss": 0.81444758, "num_input_tokens_seen": 156145930, "step": 7217, "time_per_iteration": 2.4898715019226074 }, { "auxiliary_loss_clip": 0.0113438, "auxiliary_loss_mlp": 0.01024173, "balance_loss_clip": 1.04806554, "balance_loss_mlp": 1.01705623, "epoch": 0.8679131846329585, "flos": 18844160906880.0, "grad_norm": 2.4406140402838963, "language_loss": 0.760023, "learning_rate": 1.8011873710293912e-07, "loss": 0.78160852, "num_input_tokens_seen": 156164435, "step": 7218, "time_per_iteration": 2.5345187187194824 }, { "auxiliary_loss_clip": 0.01145642, "auxiliary_loss_mlp": 0.01026166, "balance_loss_clip": 1.04529572, "balance_loss_mlp": 1.01940346, "epoch": 0.8680334275235977, "flos": 33620718890880.0, "grad_norm": 1.8993937492551125, "language_loss": 0.70036644, "learning_rate": 1.7979580559342677e-07, "loss": 0.72208452, "num_input_tokens_seen": 156185165, "step": 7219, "time_per_iteration": 2.5918309688568115 }, { "auxiliary_loss_clip": 0.01136944, "auxiliary_loss_mlp": 0.01032871, "balance_loss_clip": 1.04674459, "balance_loss_mlp": 1.02596259, "epoch": 0.8681536704142367, "flos": 24681152810880.0, "grad_norm": 1.717289454406398, "language_loss": 0.67124176, "learning_rate": 1.7947315020744358e-07, "loss": 0.69293994, "num_input_tokens_seen": 156206260, "step": 7220, "time_per_iteration": 2.573223114013672 }, { "auxiliary_loss_clip": 0.01134693, "auxiliary_loss_mlp": 0.01028125, "balance_loss_clip": 1.04282212, "balance_loss_mlp": 1.02131534, "epoch": 0.8682739133048758, "flos": 20011042131840.0, "grad_norm": 2.550953516243246, "language_loss": 0.80306327, "learning_rate": 1.7915077099393594e-07, "loss": 0.82469141, "num_input_tokens_seen": 156222860, "step": 7221, "time_per_iteration": 2.5091428756713867 }, { "auxiliary_loss_clip": 0.01151914, "auxiliary_loss_mlp": 0.01022406, "balance_loss_clip": 1.0460459, "balance_loss_mlp": 1.01520848, "epoch": 0.868394156195515, "flos": 16654759217280.0, "grad_norm": 1.8681559982762532, "language_loss": 0.7291832, "learning_rate": 1.788286680018083e-07, "loss": 0.75092638, "num_input_tokens_seen": 156241570, "step": 7222, "time_per_iteration": 2.475740909576416 }, { "auxiliary_loss_clip": 0.01138452, "auxiliary_loss_mlp": 0.0102294, "balance_loss_clip": 1.04483795, "balance_loss_mlp": 1.01639581, "epoch": 0.868514399086154, "flos": 28001381448960.0, "grad_norm": 1.7817053768466888, "language_loss": 0.72261524, "learning_rate": 1.7850684127992443e-07, "loss": 0.74422914, "num_input_tokens_seen": 156261315, "step": 7223, "time_per_iteration": 3.424898624420166 }, { "auxiliary_loss_clip": 0.01125564, "auxiliary_loss_mlp": 0.01026392, "balance_loss_clip": 1.04567873, "balance_loss_mlp": 1.02003193, "epoch": 0.8686346419767931, "flos": 20084587228800.0, "grad_norm": 1.5172550678104952, "language_loss": 0.70296121, "learning_rate": 1.7818529087710378e-07, "loss": 0.72448081, "num_input_tokens_seen": 156281670, "step": 7224, "time_per_iteration": 2.558955192565918 }, { "auxiliary_loss_clip": 0.01147351, "auxiliary_loss_mlp": 0.00760315, "balance_loss_clip": 1.0452261, "balance_loss_mlp": 1.00029361, "epoch": 0.8687548848674322, "flos": 18223516782720.0, "grad_norm": 1.606313478254771, "language_loss": 0.84044576, "learning_rate": 1.7786401684212637e-07, "loss": 0.85952246, "num_input_tokens_seen": 156300500, "step": 7225, "time_per_iteration": 2.488352060317993 }, { "auxiliary_loss_clip": 0.01016361, "auxiliary_loss_mlp": 0.01001609, "balance_loss_clip": 1.00843859, "balance_loss_mlp": 1.00060809, "epoch": 0.8688751277580713, "flos": 70457885049600.0, "grad_norm": 0.7307650601745384, "language_loss": 0.55968791, "learning_rate": 1.7754301922372883e-07, "loss": 0.5798676, "num_input_tokens_seen": 156350145, "step": 7226, "time_per_iteration": 2.993025302886963 }, { "auxiliary_loss_clip": 0.01099023, "auxiliary_loss_mlp": 0.01028326, "balance_loss_clip": 1.04020715, "balance_loss_mlp": 1.02177811, "epoch": 0.8689953706487104, "flos": 26906788344960.0, "grad_norm": 1.9151947619700873, "language_loss": 0.80918247, "learning_rate": 1.7722229807060617e-07, "loss": 0.83045596, "num_input_tokens_seen": 156368725, "step": 7227, "time_per_iteration": 2.6684646606445312 }, { "auxiliary_loss_clip": 0.01110953, "auxiliary_loss_mlp": 0.01025777, "balance_loss_clip": 1.0392493, "balance_loss_mlp": 1.01916742, "epoch": 0.8691156135393495, "flos": 34637385438720.0, "grad_norm": 2.1039563268406907, "language_loss": 0.81963575, "learning_rate": 1.7690185343141172e-07, "loss": 0.84100306, "num_input_tokens_seen": 156388640, "step": 7228, "time_per_iteration": 2.6655049324035645 }, { "auxiliary_loss_clip": 0.01132452, "auxiliary_loss_mlp": 0.01020463, "balance_loss_clip": 1.04117429, "balance_loss_mlp": 1.01415396, "epoch": 0.8692358564299886, "flos": 18989814556800.0, "grad_norm": 1.8391238964722423, "language_loss": 0.69922686, "learning_rate": 1.7658168535475615e-07, "loss": 0.72075593, "num_input_tokens_seen": 156406425, "step": 7229, "time_per_iteration": 2.526275873184204 }, { "auxiliary_loss_clip": 0.01139718, "auxiliary_loss_mlp": 0.01023121, "balance_loss_clip": 1.04562187, "balance_loss_mlp": 1.01567912, "epoch": 0.8693560993206276, "flos": 30370839039360.0, "grad_norm": 1.5561703106822824, "language_loss": 0.64362925, "learning_rate": 1.7626179388920948e-07, "loss": 0.66525763, "num_input_tokens_seen": 156427705, "step": 7230, "time_per_iteration": 2.58770751953125 }, { "auxiliary_loss_clip": 0.0113446, "auxiliary_loss_mlp": 0.00760462, "balance_loss_clip": 1.04429674, "balance_loss_mlp": 1.00034571, "epoch": 0.8694763422112668, "flos": 27200430028800.0, "grad_norm": 1.82827266680265, "language_loss": 0.80581683, "learning_rate": 1.7594217908329866e-07, "loss": 0.82476604, "num_input_tokens_seen": 156449890, "step": 7231, "time_per_iteration": 2.6049158573150635 }, { "auxiliary_loss_clip": 0.01127777, "auxiliary_loss_mlp": 0.01025967, "balance_loss_clip": 1.04500175, "balance_loss_mlp": 1.01966715, "epoch": 0.8695965851019059, "flos": 26139161767680.0, "grad_norm": 1.9231715000523202, "language_loss": 0.73999178, "learning_rate": 1.7562284098550895e-07, "loss": 0.76152921, "num_input_tokens_seen": 156469600, "step": 7232, "time_per_iteration": 2.5568931102752686 }, { "auxiliary_loss_clip": 0.01033308, "auxiliary_loss_mlp": 0.01001143, "balance_loss_clip": 1.01137984, "balance_loss_mlp": 1.0001235, "epoch": 0.8697168279925449, "flos": 67332616456320.0, "grad_norm": 0.8342698516678518, "language_loss": 0.62279356, "learning_rate": 1.753037796442838e-07, "loss": 0.64313805, "num_input_tokens_seen": 156529040, "step": 7233, "time_per_iteration": 3.8936808109283447 }, { "auxiliary_loss_clip": 0.01162594, "auxiliary_loss_mlp": 0.01024842, "balance_loss_clip": 1.04615569, "balance_loss_mlp": 1.01771581, "epoch": 0.8698370708831841, "flos": 19718693337600.0, "grad_norm": 2.2405228660512444, "language_loss": 0.75602138, "learning_rate": 1.74984995108024e-07, "loss": 0.77789569, "num_input_tokens_seen": 156546970, "step": 7234, "time_per_iteration": 3.3659839630126953 }, { "auxiliary_loss_clip": 0.01151038, "auxiliary_loss_mlp": 0.01020654, "balance_loss_clip": 1.04601598, "balance_loss_mlp": 1.01401091, "epoch": 0.8699573137738231, "flos": 12859971068160.0, "grad_norm": 2.0295843381939247, "language_loss": 0.83074689, "learning_rate": 1.7466648742508981e-07, "loss": 0.85246378, "num_input_tokens_seen": 156563155, "step": 7235, "time_per_iteration": 3.2632036209106445 }, { "auxiliary_loss_clip": 0.01136602, "auxiliary_loss_mlp": 0.01022528, "balance_loss_clip": 1.04718876, "balance_loss_mlp": 1.01541114, "epoch": 0.8700775566644622, "flos": 17420733768960.0, "grad_norm": 1.930149562551747, "language_loss": 0.84745878, "learning_rate": 1.7434825664379837e-07, "loss": 0.86905009, "num_input_tokens_seen": 156581660, "step": 7236, "time_per_iteration": 2.5758790969848633 }, { "auxiliary_loss_clip": 0.01149112, "auxiliary_loss_mlp": 0.01024811, "balance_loss_clip": 1.04621935, "balance_loss_mlp": 1.0177542, "epoch": 0.8701977995551013, "flos": 13735221770880.0, "grad_norm": 5.281286732995591, "language_loss": 0.85937786, "learning_rate": 1.740303028124246e-07, "loss": 0.88111711, "num_input_tokens_seen": 156597720, "step": 7237, "time_per_iteration": 2.455282688140869 }, { "auxiliary_loss_clip": 0.01081065, "auxiliary_loss_mlp": 0.01022418, "balance_loss_clip": 1.03545618, "balance_loss_mlp": 1.01576614, "epoch": 0.8703180424457404, "flos": 30555707362560.0, "grad_norm": 2.121458099190664, "language_loss": 0.75855213, "learning_rate": 1.7371262597920212e-07, "loss": 0.77958697, "num_input_tokens_seen": 156619780, "step": 7238, "time_per_iteration": 2.751269578933716 }, { "auxiliary_loss_clip": 0.0110715, "auxiliary_loss_mlp": 0.01029007, "balance_loss_clip": 1.04329109, "balance_loss_mlp": 1.0223968, "epoch": 0.8704382853363795, "flos": 19608986223360.0, "grad_norm": 1.5337485672625784, "language_loss": 0.76407123, "learning_rate": 1.7339522619232195e-07, "loss": 0.78543282, "num_input_tokens_seen": 156638160, "step": 7239, "time_per_iteration": 2.5772736072540283 }, { "auxiliary_loss_clip": 0.01141675, "auxiliary_loss_mlp": 0.01023666, "balance_loss_clip": 1.04315519, "balance_loss_mlp": 1.01621556, "epoch": 0.8705585282270186, "flos": 26613900846720.0, "grad_norm": 1.8564530937630506, "language_loss": 0.75456315, "learning_rate": 1.730781034999338e-07, "loss": 0.77621663, "num_input_tokens_seen": 156659740, "step": 7240, "time_per_iteration": 2.614659309387207 }, { "auxiliary_loss_clip": 0.01159973, "auxiliary_loss_mlp": 0.01025625, "balance_loss_clip": 1.04741657, "balance_loss_mlp": 1.01957548, "epoch": 0.8706787711176577, "flos": 34090465979520.0, "grad_norm": 1.8920661939198817, "language_loss": 0.73265946, "learning_rate": 1.7276125795014497e-07, "loss": 0.75451541, "num_input_tokens_seen": 156678190, "step": 7241, "time_per_iteration": 2.5783803462982178 }, { "auxiliary_loss_clip": 0.01137328, "auxiliary_loss_mlp": 0.01025483, "balance_loss_clip": 1.04154158, "balance_loss_mlp": 1.01815808, "epoch": 0.8707990140082967, "flos": 14611513968000.0, "grad_norm": 1.8668120412543965, "language_loss": 0.67485589, "learning_rate": 1.7244468959102054e-07, "loss": 0.69648403, "num_input_tokens_seen": 156695245, "step": 7242, "time_per_iteration": 2.5282657146453857 }, { "auxiliary_loss_clip": 0.01150574, "auxiliary_loss_mlp": 0.01026613, "balance_loss_clip": 1.04816449, "balance_loss_mlp": 1.01949286, "epoch": 0.8709192568989359, "flos": 20084156265600.0, "grad_norm": 2.5127771107444556, "language_loss": 0.85430026, "learning_rate": 1.7212839847058348e-07, "loss": 0.87607217, "num_input_tokens_seen": 156710375, "step": 7243, "time_per_iteration": 2.4851412773132324 }, { "auxiliary_loss_clip": 0.01097427, "auxiliary_loss_mlp": 0.01023864, "balance_loss_clip": 1.03967404, "balance_loss_mlp": 1.01634228, "epoch": 0.871039499789575, "flos": 16727083251840.0, "grad_norm": 2.091154607841811, "language_loss": 0.73694909, "learning_rate": 1.718123846368147e-07, "loss": 0.75816202, "num_input_tokens_seen": 156729420, "step": 7244, "time_per_iteration": 2.654014825820923 }, { "auxiliary_loss_clip": 0.01136849, "auxiliary_loss_mlp": 0.00759787, "balance_loss_clip": 1.04660261, "balance_loss_mlp": 1.00028503, "epoch": 0.871159742680214, "flos": 21068790860160.0, "grad_norm": 1.5877359297874662, "language_loss": 0.71352363, "learning_rate": 1.714966481376543e-07, "loss": 0.73248994, "num_input_tokens_seen": 156746100, "step": 7245, "time_per_iteration": 2.5438969135284424 }, { "auxiliary_loss_clip": 0.01146786, "auxiliary_loss_mlp": 0.01028456, "balance_loss_clip": 1.04411983, "balance_loss_mlp": 1.0216347, "epoch": 0.8712799855708532, "flos": 28256526731520.0, "grad_norm": 2.305690620427168, "language_loss": 0.82821083, "learning_rate": 1.7118118902099797e-07, "loss": 0.84996331, "num_input_tokens_seen": 156764185, "step": 7246, "time_per_iteration": 2.5696041584014893 }, { "auxiliary_loss_clip": 0.011496, "auxiliary_loss_mlp": 0.01027389, "balance_loss_clip": 1.04541683, "balance_loss_mlp": 1.02093673, "epoch": 0.8714002284614922, "flos": 22236677665920.0, "grad_norm": 3.1693591445143356, "language_loss": 0.80885243, "learning_rate": 1.7086600733470146e-07, "loss": 0.83062226, "num_input_tokens_seen": 156784855, "step": 7247, "time_per_iteration": 2.5106236934661865 }, { "auxiliary_loss_clip": 0.01144534, "auxiliary_loss_mlp": 0.01022294, "balance_loss_clip": 1.04486835, "balance_loss_mlp": 1.01541829, "epoch": 0.8715204713521313, "flos": 21431919404160.0, "grad_norm": 1.7284481822148026, "language_loss": 0.7698341, "learning_rate": 1.7055110312657738e-07, "loss": 0.79150236, "num_input_tokens_seen": 156804350, "step": 7248, "time_per_iteration": 2.5389859676361084 }, { "auxiliary_loss_clip": 0.01130089, "auxiliary_loss_mlp": 0.01031512, "balance_loss_clip": 1.04533267, "balance_loss_mlp": 1.0242877, "epoch": 0.8716407142427703, "flos": 23440439180160.0, "grad_norm": 3.2894750535904516, "language_loss": 0.74189174, "learning_rate": 1.702364764443962e-07, "loss": 0.76350784, "num_input_tokens_seen": 156823425, "step": 7249, "time_per_iteration": 3.373906135559082 }, { "auxiliary_loss_clip": 0.01088272, "auxiliary_loss_mlp": 0.01032406, "balance_loss_clip": 1.03628683, "balance_loss_mlp": 1.0250001, "epoch": 0.8717609571334095, "flos": 27958683156480.0, "grad_norm": 2.1472587217199792, "language_loss": 0.72478497, "learning_rate": 1.6992212733588685e-07, "loss": 0.74599177, "num_input_tokens_seen": 156843090, "step": 7250, "time_per_iteration": 2.6924874782562256 }, { "auxiliary_loss_clip": 0.01129438, "auxiliary_loss_mlp": 0.01025759, "balance_loss_clip": 1.0417192, "balance_loss_mlp": 1.01889896, "epoch": 0.8718812000240486, "flos": 25479482538240.0, "grad_norm": 1.8335147948321373, "language_loss": 0.74926996, "learning_rate": 1.6960805584873538e-07, "loss": 0.77082193, "num_input_tokens_seen": 156861090, "step": 7251, "time_per_iteration": 2.5589346885681152 }, { "auxiliary_loss_clip": 0.01112045, "auxiliary_loss_mlp": 0.01024346, "balance_loss_clip": 1.04202628, "balance_loss_mlp": 1.01794767, "epoch": 0.8720014429146876, "flos": 23403056100480.0, "grad_norm": 1.5388293824582664, "language_loss": 0.78135675, "learning_rate": 1.6929426203058684e-07, "loss": 0.80272067, "num_input_tokens_seen": 156881515, "step": 7252, "time_per_iteration": 2.6125285625457764 }, { "auxiliary_loss_clip": 0.01167783, "auxiliary_loss_mlp": 0.00761851, "balance_loss_clip": 1.04758859, "balance_loss_mlp": 1.00033903, "epoch": 0.8721216858053268, "flos": 24352821567360.0, "grad_norm": 1.9986094983489777, "language_loss": 0.8006267, "learning_rate": 1.689807459290431e-07, "loss": 0.81992304, "num_input_tokens_seen": 156900170, "step": 7253, "time_per_iteration": 2.519680976867676 }, { "auxiliary_loss_clip": 0.01137177, "auxiliary_loss_mlp": 0.01022603, "balance_loss_clip": 1.04677093, "balance_loss_mlp": 1.01619864, "epoch": 0.8722419286959658, "flos": 33869687034240.0, "grad_norm": 2.102275616940193, "language_loss": 0.70788294, "learning_rate": 1.6866750759166437e-07, "loss": 0.72948074, "num_input_tokens_seen": 156920150, "step": 7254, "time_per_iteration": 2.637498140335083 }, { "auxiliary_loss_clip": 0.01115605, "auxiliary_loss_mlp": 0.01021894, "balance_loss_clip": 1.03885508, "balance_loss_mlp": 1.01514077, "epoch": 0.8723621715866049, "flos": 18369385914240.0, "grad_norm": 2.2487136727879977, "language_loss": 0.77408236, "learning_rate": 1.6835454706596865e-07, "loss": 0.79545736, "num_input_tokens_seen": 156937980, "step": 7255, "time_per_iteration": 2.539289951324463 }, { "auxiliary_loss_clip": 0.01163837, "auxiliary_loss_mlp": 0.01030522, "balance_loss_clip": 1.04870534, "balance_loss_mlp": 1.02321792, "epoch": 0.8724824144772441, "flos": 22013348855040.0, "grad_norm": 1.6274022178453251, "language_loss": 0.7375716, "learning_rate": 1.680418643994317e-07, "loss": 0.75951517, "num_input_tokens_seen": 156956550, "step": 7256, "time_per_iteration": 2.47017240524292 }, { "auxiliary_loss_clip": 0.01055422, "auxiliary_loss_mlp": 0.01004375, "balance_loss_clip": 1.00833106, "balance_loss_mlp": 1.00327194, "epoch": 0.8726026573678831, "flos": 66698720213760.0, "grad_norm": 0.8912306016425868, "language_loss": 0.64567095, "learning_rate": 1.6772945963948738e-07, "loss": 0.66626889, "num_input_tokens_seen": 157014715, "step": 7257, "time_per_iteration": 3.0807254314422607 }, { "auxiliary_loss_clip": 0.01133651, "auxiliary_loss_mlp": 0.01027314, "balance_loss_clip": 1.04584181, "balance_loss_mlp": 1.02052188, "epoch": 0.8727229002585222, "flos": 13370908078080.0, "grad_norm": 2.244922022086923, "language_loss": 0.77496481, "learning_rate": 1.6741733283352733e-07, "loss": 0.79657441, "num_input_tokens_seen": 157032320, "step": 7258, "time_per_iteration": 2.5111660957336426 }, { "auxiliary_loss_clip": 0.01106945, "auxiliary_loss_mlp": 0.01025759, "balance_loss_clip": 1.04205239, "balance_loss_mlp": 1.01835299, "epoch": 0.8728431431491613, "flos": 21796987282560.0, "grad_norm": 1.439710622335369, "language_loss": 0.83877206, "learning_rate": 1.6710548402890102e-07, "loss": 0.86009914, "num_input_tokens_seen": 157052845, "step": 7259, "time_per_iteration": 3.486658811569214 }, { "auxiliary_loss_clip": 0.01168007, "auxiliary_loss_mlp": 0.01029148, "balance_loss_clip": 1.04883504, "balance_loss_mlp": 1.02149796, "epoch": 0.8729633860398004, "flos": 36173823742080.0, "grad_norm": 2.003840232552996, "language_loss": 0.66733563, "learning_rate": 1.6679391327291527e-07, "loss": 0.68930721, "num_input_tokens_seen": 157074050, "step": 7260, "time_per_iteration": 2.5792763233184814 }, { "auxiliary_loss_clip": 0.01134071, "auxiliary_loss_mlp": 0.01024818, "balance_loss_clip": 1.04126096, "balance_loss_mlp": 1.01827645, "epoch": 0.8730836289304394, "flos": 16359680989440.0, "grad_norm": 4.476415297521812, "language_loss": 0.6755181, "learning_rate": 1.6648262061283492e-07, "loss": 0.69710696, "num_input_tokens_seen": 157089350, "step": 7261, "time_per_iteration": 3.1847312450408936 }, { "auxiliary_loss_clip": 0.01121706, "auxiliary_loss_mlp": 0.01020692, "balance_loss_clip": 1.04145992, "balance_loss_mlp": 1.01428413, "epoch": 0.8732038718210786, "flos": 21215126868480.0, "grad_norm": 1.8860919031324312, "language_loss": 0.73345923, "learning_rate": 1.6617160609588353e-07, "loss": 0.75488317, "num_input_tokens_seen": 157108525, "step": 7262, "time_per_iteration": 2.5628204345703125 }, { "auxiliary_loss_clip": 0.01136185, "auxiliary_loss_mlp": 0.01023252, "balance_loss_clip": 1.04461658, "balance_loss_mlp": 1.01649022, "epoch": 0.8733241147117177, "flos": 16610696208000.0, "grad_norm": 1.8933650243993077, "language_loss": 0.71737623, "learning_rate": 1.6586086976924163e-07, "loss": 0.73897058, "num_input_tokens_seen": 157124025, "step": 7263, "time_per_iteration": 2.475778341293335 }, { "auxiliary_loss_clip": 0.01149066, "auxiliary_loss_mlp": 0.01026004, "balance_loss_clip": 1.04471374, "balance_loss_mlp": 1.01928341, "epoch": 0.8734443576023567, "flos": 20193935207040.0, "grad_norm": 8.937802917663968, "language_loss": 0.78281009, "learning_rate": 1.6555041168004747e-07, "loss": 0.80456078, "num_input_tokens_seen": 157143345, "step": 7264, "time_per_iteration": 2.4973249435424805 }, { "auxiliary_loss_clip": 0.01131998, "auxiliary_loss_mlp": 0.0102517, "balance_loss_clip": 1.04360247, "balance_loss_mlp": 1.01860702, "epoch": 0.8735646004929959, "flos": 18041162411520.0, "grad_norm": 2.219088872395557, "language_loss": 0.68316293, "learning_rate": 1.6524023187539715e-07, "loss": 0.70473456, "num_input_tokens_seen": 157161630, "step": 7265, "time_per_iteration": 2.531047821044922 }, { "auxiliary_loss_clip": 0.01136004, "auxiliary_loss_mlp": 0.01028326, "balance_loss_clip": 1.04479539, "balance_loss_mlp": 1.02110755, "epoch": 0.873684843383635, "flos": 20262344659200.0, "grad_norm": 2.1141748876787134, "language_loss": 0.74912041, "learning_rate": 1.649303304023446e-07, "loss": 0.7707637, "num_input_tokens_seen": 157181385, "step": 7266, "time_per_iteration": 2.5353713035583496 }, { "auxiliary_loss_clip": 0.01121373, "auxiliary_loss_mlp": 0.01022918, "balance_loss_clip": 1.04833364, "balance_loss_mlp": 1.01620018, "epoch": 0.873805086274274, "flos": 16947287579520.0, "grad_norm": 1.7126221275916693, "language_loss": 0.78547233, "learning_rate": 1.6462070730790246e-07, "loss": 0.80691528, "num_input_tokens_seen": 157200545, "step": 7267, "time_per_iteration": 2.5447070598602295 }, { "auxiliary_loss_clip": 0.01129734, "auxiliary_loss_mlp": 0.0102724, "balance_loss_clip": 1.04041398, "balance_loss_mlp": 1.02030528, "epoch": 0.8739253291649132, "flos": 18041270152320.0, "grad_norm": 2.3813467403970052, "language_loss": 0.78627872, "learning_rate": 1.6431136263903912e-07, "loss": 0.80784851, "num_input_tokens_seen": 157219545, "step": 7268, "time_per_iteration": 2.502641439437866 }, { "auxiliary_loss_clip": 0.01152097, "auxiliary_loss_mlp": 0.00760964, "balance_loss_clip": 1.04388785, "balance_loss_mlp": 1.0003109, "epoch": 0.8740455720555522, "flos": 21325085377920.0, "grad_norm": 1.8710128239649635, "language_loss": 0.73075521, "learning_rate": 1.6400229644268282e-07, "loss": 0.7498858, "num_input_tokens_seen": 157237900, "step": 7269, "time_per_iteration": 2.4958252906799316 }, { "auxiliary_loss_clip": 0.01113736, "auxiliary_loss_mlp": 0.0103043, "balance_loss_clip": 1.04555845, "balance_loss_mlp": 1.02325654, "epoch": 0.8741658149461913, "flos": 15158684822400.0, "grad_norm": 2.2180817682497422, "language_loss": 0.80776381, "learning_rate": 1.6369350876571852e-07, "loss": 0.82920551, "num_input_tokens_seen": 157256055, "step": 7270, "time_per_iteration": 2.5387651920318604 }, { "auxiliary_loss_clip": 0.01103537, "auxiliary_loss_mlp": 0.01025569, "balance_loss_clip": 1.03918219, "balance_loss_mlp": 1.01859784, "epoch": 0.8742860578368304, "flos": 23039855729280.0, "grad_norm": 2.619662895972596, "language_loss": 0.81535876, "learning_rate": 1.6338499965498874e-07, "loss": 0.83664972, "num_input_tokens_seen": 157274785, "step": 7271, "time_per_iteration": 2.6033706665039062 }, { "auxiliary_loss_clip": 0.01113208, "auxiliary_loss_mlp": 0.01027621, "balance_loss_clip": 1.03969812, "balance_loss_mlp": 1.02022076, "epoch": 0.8744063007274695, "flos": 28145347159680.0, "grad_norm": 1.498136748090697, "language_loss": 0.77474028, "learning_rate": 1.630767691572943e-07, "loss": 0.7961486, "num_input_tokens_seen": 157294805, "step": 7272, "time_per_iteration": 2.630840539932251 }, { "auxiliary_loss_clip": 0.01037134, "auxiliary_loss_mlp": 0.01001892, "balance_loss_clip": 1.00822568, "balance_loss_mlp": 1.00072408, "epoch": 0.8745265436181086, "flos": 64034076654720.0, "grad_norm": 1.0097073822294282, "language_loss": 0.53512752, "learning_rate": 1.6276881731939306e-07, "loss": 0.55551779, "num_input_tokens_seen": 157356695, "step": 7273, "time_per_iteration": 3.1850881576538086 }, { "auxiliary_loss_clip": 0.01146517, "auxiliary_loss_mlp": 0.01020397, "balance_loss_clip": 1.04580176, "balance_loss_mlp": 1.0138495, "epoch": 0.8746467865087477, "flos": 28658618553600.0, "grad_norm": 1.9932146921174259, "language_loss": 0.75439781, "learning_rate": 1.6246114418800193e-07, "loss": 0.77606696, "num_input_tokens_seen": 157376975, "step": 7274, "time_per_iteration": 2.5598864555358887 }, { "auxiliary_loss_clip": 0.0114321, "auxiliary_loss_mlp": 0.01026073, "balance_loss_clip": 1.0448997, "balance_loss_mlp": 1.01889062, "epoch": 0.8747670293993868, "flos": 23985850268160.0, "grad_norm": 1.6832415177500095, "language_loss": 0.76897204, "learning_rate": 1.6215374980979423e-07, "loss": 0.79066479, "num_input_tokens_seen": 157397385, "step": 7275, "time_per_iteration": 3.311440944671631 }, { "auxiliary_loss_clip": 0.01144712, "auxiliary_loss_mlp": 0.01025893, "balance_loss_clip": 1.04730201, "balance_loss_mlp": 1.01895833, "epoch": 0.8748872722900258, "flos": 45221624478720.0, "grad_norm": 1.9649667744685888, "language_loss": 0.68826187, "learning_rate": 1.6184663423140133e-07, "loss": 0.70996791, "num_input_tokens_seen": 157417685, "step": 7276, "time_per_iteration": 2.6920692920684814 }, { "auxiliary_loss_clip": 0.01110997, "auxiliary_loss_mlp": 0.01031442, "balance_loss_clip": 1.04253948, "balance_loss_mlp": 1.02467084, "epoch": 0.875007515180665, "flos": 19754280737280.0, "grad_norm": 1.933814001514685, "language_loss": 0.64145964, "learning_rate": 1.615397974994126e-07, "loss": 0.662884, "num_input_tokens_seen": 157435490, "step": 7277, "time_per_iteration": 2.5908565521240234 }, { "auxiliary_loss_clip": 0.01160331, "auxiliary_loss_mlp": 0.01020217, "balance_loss_clip": 1.04577363, "balance_loss_mlp": 1.01364267, "epoch": 0.875127758071304, "flos": 22710734386560.0, "grad_norm": 2.2644843306513143, "language_loss": 0.80635691, "learning_rate": 1.6123323966037438e-07, "loss": 0.82816237, "num_input_tokens_seen": 157454010, "step": 7278, "time_per_iteration": 2.4663772583007812 }, { "auxiliary_loss_clip": 0.01162536, "auxiliary_loss_mlp": 0.0102438, "balance_loss_clip": 1.04779434, "balance_loss_mlp": 1.01743031, "epoch": 0.8752480009619431, "flos": 23403846199680.0, "grad_norm": 2.073633729710139, "language_loss": 0.78626877, "learning_rate": 1.6092696076079216e-07, "loss": 0.80813795, "num_input_tokens_seen": 157472385, "step": 7279, "time_per_iteration": 2.469346523284912 }, { "auxiliary_loss_clip": 0.01108096, "auxiliary_loss_mlp": 0.01021204, "balance_loss_clip": 1.03960109, "balance_loss_mlp": 1.01481736, "epoch": 0.8753682438525822, "flos": 26213101914240.0, "grad_norm": 1.6519055302722412, "language_loss": 0.73761839, "learning_rate": 1.6062096084712785e-07, "loss": 0.75891137, "num_input_tokens_seen": 157493735, "step": 7280, "time_per_iteration": 2.6007862091064453 }, { "auxiliary_loss_clip": 0.01125058, "auxiliary_loss_mlp": 0.0076093, "balance_loss_clip": 1.04037356, "balance_loss_mlp": 1.00029039, "epoch": 0.8754884867432213, "flos": 23326745656320.0, "grad_norm": 1.7095591901105083, "language_loss": 0.70159721, "learning_rate": 1.6031523996580098e-07, "loss": 0.72045708, "num_input_tokens_seen": 157511295, "step": 7281, "time_per_iteration": 2.5322630405426025 }, { "auxiliary_loss_clip": 0.01130045, "auxiliary_loss_mlp": 0.01028271, "balance_loss_clip": 1.04452038, "balance_loss_mlp": 1.02176499, "epoch": 0.8756087296338604, "flos": 12495226412160.0, "grad_norm": 1.9069893472463315, "language_loss": 0.66435218, "learning_rate": 1.6000979816318981e-07, "loss": 0.68593532, "num_input_tokens_seen": 157529760, "step": 7282, "time_per_iteration": 2.527578830718994 }, { "auxiliary_loss_clip": 0.01145394, "auxiliary_loss_mlp": 0.01028544, "balance_loss_clip": 1.04663348, "balance_loss_mlp": 1.02151632, "epoch": 0.8757289725244994, "flos": 18952898353920.0, "grad_norm": 2.2406820481520495, "language_loss": 0.74770451, "learning_rate": 1.5970463548562886e-07, "loss": 0.76944387, "num_input_tokens_seen": 157548915, "step": 7283, "time_per_iteration": 2.4771006107330322 }, { "auxiliary_loss_clip": 0.01133772, "auxiliary_loss_mlp": 0.01026426, "balance_loss_clip": 1.04506874, "balance_loss_mlp": 1.01986623, "epoch": 0.8758492154151386, "flos": 25265958140160.0, "grad_norm": 1.8291737100528047, "language_loss": 0.71282876, "learning_rate": 1.5939975197941192e-07, "loss": 0.73443067, "num_input_tokens_seen": 157570570, "step": 7284, "time_per_iteration": 2.5453524589538574 }, { "auxiliary_loss_clip": 0.01037372, "auxiliary_loss_mlp": 0.00999912, "balance_loss_clip": 1.00897384, "balance_loss_mlp": 0.99881554, "epoch": 0.8759694583057777, "flos": 65571664193280.0, "grad_norm": 0.8130133828426268, "language_loss": 0.53437507, "learning_rate": 1.5909514769078892e-07, "loss": 0.55474788, "num_input_tokens_seen": 157635675, "step": 7285, "time_per_iteration": 4.748593330383301 }, { "auxiliary_loss_clip": 0.01110934, "auxiliary_loss_mlp": 0.01026439, "balance_loss_clip": 1.04432464, "balance_loss_mlp": 1.01989472, "epoch": 0.8760897011964167, "flos": 25446193608960.0, "grad_norm": 1.5058453695565943, "language_loss": 0.77789986, "learning_rate": 1.5879082266596867e-07, "loss": 0.79927361, "num_input_tokens_seen": 157657015, "step": 7286, "time_per_iteration": 2.5773732662200928 }, { "auxiliary_loss_clip": 0.01126697, "auxiliary_loss_mlp": 0.01023593, "balance_loss_clip": 1.0377233, "balance_loss_mlp": 1.01657736, "epoch": 0.8762099440870559, "flos": 28984830894720.0, "grad_norm": 2.0585478556652306, "language_loss": 0.71847177, "learning_rate": 1.5848677695111645e-07, "loss": 0.73997462, "num_input_tokens_seen": 157678615, "step": 7287, "time_per_iteration": 3.2920982837677 }, { "auxiliary_loss_clip": 0.01130748, "auxiliary_loss_mlp": 0.01026654, "balance_loss_clip": 1.04547107, "balance_loss_mlp": 1.01922154, "epoch": 0.8763301869776949, "flos": 21609461352960.0, "grad_norm": 2.641360880554864, "language_loss": 0.70263755, "learning_rate": 1.5818301059235562e-07, "loss": 0.72421157, "num_input_tokens_seen": 157693790, "step": 7288, "time_per_iteration": 2.533047914505005 }, { "auxiliary_loss_clip": 0.01138436, "auxiliary_loss_mlp": 0.01028455, "balance_loss_clip": 1.04645872, "balance_loss_mlp": 1.02117157, "epoch": 0.876450429868334, "flos": 24644416176000.0, "grad_norm": 1.7257988638366073, "language_loss": 0.81458044, "learning_rate": 1.578795236357684e-07, "loss": 0.83624935, "num_input_tokens_seen": 157715255, "step": 7289, "time_per_iteration": 2.574993848800659 }, { "auxiliary_loss_clip": 0.01136731, "auxiliary_loss_mlp": 0.01031104, "balance_loss_clip": 1.04705048, "balance_loss_mlp": 1.02455091, "epoch": 0.8765706727589732, "flos": 20260046188800.0, "grad_norm": 1.9935633760287863, "language_loss": 0.85524219, "learning_rate": 1.5757631612739218e-07, "loss": 0.87692046, "num_input_tokens_seen": 157728800, "step": 7290, "time_per_iteration": 2.5234549045562744 }, { "auxiliary_loss_clip": 0.01054949, "auxiliary_loss_mlp": 0.01001834, "balance_loss_clip": 1.0077523, "balance_loss_mlp": 1.00073147, "epoch": 0.8766909156496122, "flos": 71371165276800.0, "grad_norm": 0.7818937620509236, "language_loss": 0.6145612, "learning_rate": 1.572733881132242e-07, "loss": 0.63512909, "num_input_tokens_seen": 157789445, "step": 7291, "time_per_iteration": 3.114102840423584 }, { "auxiliary_loss_clip": 0.01021873, "auxiliary_loss_mlp": 0.0100195, "balance_loss_clip": 1.00959802, "balance_loss_mlp": 1.00088918, "epoch": 0.8768111585402513, "flos": 69523490603520.0, "grad_norm": 0.785110947082407, "language_loss": 0.58556604, "learning_rate": 1.5697073963921814e-07, "loss": 0.60580426, "num_input_tokens_seen": 157848685, "step": 7292, "time_per_iteration": 3.0806939601898193 }, { "auxiliary_loss_clip": 0.01151391, "auxiliary_loss_mlp": 0.0102772, "balance_loss_clip": 1.04738998, "balance_loss_mlp": 1.02050185, "epoch": 0.8769314014308904, "flos": 18838558385280.0, "grad_norm": 2.196011523159466, "language_loss": 0.84755874, "learning_rate": 1.566683707512857e-07, "loss": 0.86934984, "num_input_tokens_seen": 157866360, "step": 7293, "time_per_iteration": 2.4787776470184326 }, { "auxiliary_loss_clip": 0.0113208, "auxiliary_loss_mlp": 0.01026241, "balance_loss_clip": 1.0433445, "balance_loss_mlp": 1.01863003, "epoch": 0.8770516443215295, "flos": 14976402278400.0, "grad_norm": 2.8567140703510048, "language_loss": 0.79494619, "learning_rate": 1.5636628149529553e-07, "loss": 0.81652939, "num_input_tokens_seen": 157884150, "step": 7294, "time_per_iteration": 2.5008068084716797 }, { "auxiliary_loss_clip": 0.01131987, "auxiliary_loss_mlp": 0.01023761, "balance_loss_clip": 1.04275203, "balance_loss_mlp": 1.01685858, "epoch": 0.8771718872121685, "flos": 31649654021760.0, "grad_norm": 2.122648365650335, "language_loss": 0.79613316, "learning_rate": 1.560644719170743e-07, "loss": 0.81769061, "num_input_tokens_seen": 157905020, "step": 7295, "time_per_iteration": 2.6019372940063477 }, { "auxiliary_loss_clip": 0.0111842, "auxiliary_loss_mlp": 0.01023824, "balance_loss_clip": 1.04065895, "balance_loss_mlp": 1.0167253, "epoch": 0.8772921301028077, "flos": 36095466222720.0, "grad_norm": 3.701098387421143, "language_loss": 0.72053802, "learning_rate": 1.5576294206240692e-07, "loss": 0.74196041, "num_input_tokens_seen": 157924545, "step": 7296, "time_per_iteration": 2.671232223510742 }, { "auxiliary_loss_clip": 0.01130549, "auxiliary_loss_mlp": 0.01026943, "balance_loss_clip": 1.04387164, "balance_loss_mlp": 1.01963854, "epoch": 0.8774123729934468, "flos": 57116961849600.0, "grad_norm": 1.705394105452408, "language_loss": 0.67963529, "learning_rate": 1.5546169197703507e-07, "loss": 0.7012102, "num_input_tokens_seen": 157950820, "step": 7297, "time_per_iteration": 2.8512649536132812 }, { "auxiliary_loss_clip": 0.01136494, "auxiliary_loss_mlp": 0.0102368, "balance_loss_clip": 1.04106116, "balance_loss_mlp": 1.01674235, "epoch": 0.8775326158840858, "flos": 23914495900800.0, "grad_norm": 2.439962239826139, "language_loss": 0.77431023, "learning_rate": 1.5516072170665774e-07, "loss": 0.79591197, "num_input_tokens_seen": 157968790, "step": 7298, "time_per_iteration": 2.540191650390625 }, { "auxiliary_loss_clip": 0.01151287, "auxiliary_loss_mlp": 0.01028742, "balance_loss_clip": 1.04690433, "balance_loss_mlp": 1.02229011, "epoch": 0.877652858774725, "flos": 17123285243520.0, "grad_norm": 1.7875150525199515, "language_loss": 0.86916173, "learning_rate": 1.5486003129693214e-07, "loss": 0.890962, "num_input_tokens_seen": 157986155, "step": 7299, "time_per_iteration": 2.475304365158081 }, { "auxiliary_loss_clip": 0.01152174, "auxiliary_loss_mlp": 0.01028693, "balance_loss_clip": 1.04714346, "balance_loss_mlp": 1.02168941, "epoch": 0.877773101665364, "flos": 16508961912960.0, "grad_norm": 2.6655416377275216, "language_loss": 0.7793721, "learning_rate": 1.545596207934725e-07, "loss": 0.80118072, "num_input_tokens_seen": 158004640, "step": 7300, "time_per_iteration": 3.2360286712646484 }, { "auxiliary_loss_clip": 0.01128124, "auxiliary_loss_mlp": 0.0102841, "balance_loss_clip": 1.04224193, "balance_loss_mlp": 1.02180922, "epoch": 0.8778933445560031, "flos": 22053209973120.0, "grad_norm": 1.6754033726673363, "language_loss": 0.77737141, "learning_rate": 1.5425949024185147e-07, "loss": 0.79893672, "num_input_tokens_seen": 158024665, "step": 7301, "time_per_iteration": 2.535475254058838 }, { "auxiliary_loss_clip": 0.01134381, "auxiliary_loss_mlp": 0.01026536, "balance_loss_clip": 1.04126191, "balance_loss_mlp": 1.02007151, "epoch": 0.8780135874466423, "flos": 22564757514240.0, "grad_norm": 2.5140450667222, "language_loss": 0.67463326, "learning_rate": 1.5395963968759818e-07, "loss": 0.69624245, "num_input_tokens_seen": 158044940, "step": 7302, "time_per_iteration": 2.529075860977173 }, { "auxiliary_loss_clip": 0.01132239, "auxiliary_loss_mlp": 0.01022995, "balance_loss_clip": 1.04041219, "balance_loss_mlp": 1.01619411, "epoch": 0.8781338303372813, "flos": 61531999073280.0, "grad_norm": 1.6287607465800513, "language_loss": 0.64327806, "learning_rate": 1.536600691761998e-07, "loss": 0.66483045, "num_input_tokens_seen": 158070770, "step": 7303, "time_per_iteration": 2.885249614715576 }, { "auxiliary_loss_clip": 0.01123448, "auxiliary_loss_mlp": 0.01031019, "balance_loss_clip": 1.04446232, "balance_loss_mlp": 1.02451575, "epoch": 0.8782540732279204, "flos": 22674751937280.0, "grad_norm": 3.9522549549295474, "language_loss": 0.71360546, "learning_rate": 1.5336077875310084e-07, "loss": 0.7351501, "num_input_tokens_seen": 158089995, "step": 7304, "time_per_iteration": 2.5707271099090576 }, { "auxiliary_loss_clip": 0.01111218, "auxiliary_loss_mlp": 0.01024384, "balance_loss_clip": 1.04138386, "balance_loss_mlp": 1.01783919, "epoch": 0.8783743161185595, "flos": 16070348937600.0, "grad_norm": 2.0386429423878707, "language_loss": 0.73834884, "learning_rate": 1.5306176846370321e-07, "loss": 0.75970483, "num_input_tokens_seen": 158108140, "step": 7305, "time_per_iteration": 2.557757616043091 }, { "auxiliary_loss_clip": 0.011393, "auxiliary_loss_mlp": 0.01023961, "balance_loss_clip": 1.04238963, "balance_loss_mlp": 1.01698732, "epoch": 0.8784945590091986, "flos": 26067879227520.0, "grad_norm": 2.2803253980253846, "language_loss": 0.74224722, "learning_rate": 1.5276303835336712e-07, "loss": 0.7638799, "num_input_tokens_seen": 158128680, "step": 7306, "time_per_iteration": 2.5732719898223877 }, { "auxiliary_loss_clip": 0.01046401, "auxiliary_loss_mlp": 0.01001765, "balance_loss_clip": 1.00811589, "balance_loss_mlp": 1.00065076, "epoch": 0.8786148018998376, "flos": 62720643939840.0, "grad_norm": 0.7588595532118178, "language_loss": 0.53583342, "learning_rate": 1.524645884674094e-07, "loss": 0.55631512, "num_input_tokens_seen": 158185610, "step": 7307, "time_per_iteration": 3.087132453918457 }, { "auxiliary_loss_clip": 0.01166762, "auxiliary_loss_mlp": 0.00761619, "balance_loss_clip": 1.04889488, "balance_loss_mlp": 1.00029969, "epoch": 0.8787350447904768, "flos": 21652734263040.0, "grad_norm": 2.228070257230499, "language_loss": 0.78813982, "learning_rate": 1.521664188511047e-07, "loss": 0.80742365, "num_input_tokens_seen": 158205635, "step": 7308, "time_per_iteration": 2.4924256801605225 }, { "auxiliary_loss_clip": 0.01136425, "auxiliary_loss_mlp": 0.00760642, "balance_loss_clip": 1.04763997, "balance_loss_mlp": 1.00030541, "epoch": 0.8788552876811159, "flos": 25478476957440.0, "grad_norm": 2.644741572598983, "language_loss": 0.80026078, "learning_rate": 1.518685295496851e-07, "loss": 0.81923151, "num_input_tokens_seen": 158223495, "step": 7309, "time_per_iteration": 2.5555264949798584 }, { "auxiliary_loss_clip": 0.0114717, "auxiliary_loss_mlp": 0.01021056, "balance_loss_clip": 1.04284692, "balance_loss_mlp": 1.01485062, "epoch": 0.8789755305717549, "flos": 22310222762880.0, "grad_norm": 1.6320634304775283, "language_loss": 0.8519001, "learning_rate": 1.5157092060833975e-07, "loss": 0.87358236, "num_input_tokens_seen": 158243145, "step": 7310, "time_per_iteration": 2.5036461353302 }, { "auxiliary_loss_clip": 0.01130123, "auxiliary_loss_mlp": 0.01026148, "balance_loss_clip": 1.04393566, "balance_loss_mlp": 1.01952553, "epoch": 0.879095773462394, "flos": 29310971408640.0, "grad_norm": 1.5942504863525593, "language_loss": 0.65886426, "learning_rate": 1.5127359207221658e-07, "loss": 0.68042696, "num_input_tokens_seen": 158262625, "step": 7311, "time_per_iteration": 4.176614046096802 }, { "auxiliary_loss_clip": 0.01079937, "auxiliary_loss_mlp": 0.01028004, "balance_loss_clip": 1.03321862, "balance_loss_mlp": 1.02026165, "epoch": 0.8792160163530331, "flos": 16690023394560.0, "grad_norm": 1.875180700518652, "language_loss": 0.7339679, "learning_rate": 1.5097654398641923e-07, "loss": 0.75504732, "num_input_tokens_seen": 158280530, "step": 7312, "time_per_iteration": 2.603555679321289 }, { "auxiliary_loss_clip": 0.01151224, "auxiliary_loss_mlp": 0.01028652, "balance_loss_clip": 1.04686475, "balance_loss_mlp": 1.02225065, "epoch": 0.8793362592436722, "flos": 24499301230080.0, "grad_norm": 1.3967297493337751, "language_loss": 0.73043603, "learning_rate": 1.5067977639601014e-07, "loss": 0.75223482, "num_input_tokens_seen": 158303290, "step": 7313, "time_per_iteration": 3.279355764389038 }, { "auxiliary_loss_clip": 0.01130936, "auxiliary_loss_mlp": 0.01020292, "balance_loss_clip": 1.04373348, "balance_loss_mlp": 1.01410484, "epoch": 0.8794565021343113, "flos": 14538399834240.0, "grad_norm": 2.0413218217624527, "language_loss": 0.70648813, "learning_rate": 1.5038328934600864e-07, "loss": 0.7280004, "num_input_tokens_seen": 158319925, "step": 7314, "time_per_iteration": 2.4867377281188965 }, { "auxiliary_loss_clip": 0.01134447, "auxiliary_loss_mlp": 0.01025711, "balance_loss_clip": 1.0452311, "balance_loss_mlp": 1.01901722, "epoch": 0.8795767450249504, "flos": 39530286224640.0, "grad_norm": 1.9543542302356374, "language_loss": 0.69627976, "learning_rate": 1.5008708288139161e-07, "loss": 0.71788132, "num_input_tokens_seen": 158342285, "step": 7315, "time_per_iteration": 2.677274703979492 }, { "auxiliary_loss_clip": 0.01146673, "auxiliary_loss_mlp": 0.01024457, "balance_loss_clip": 1.04635084, "balance_loss_mlp": 1.01726866, "epoch": 0.8796969879155895, "flos": 22960672197120.0, "grad_norm": 2.5716493356786745, "language_loss": 0.73158562, "learning_rate": 1.497911570470931e-07, "loss": 0.75329691, "num_input_tokens_seen": 158362290, "step": 7316, "time_per_iteration": 2.5011093616485596 }, { "auxiliary_loss_clip": 0.01110968, "auxiliary_loss_mlp": 0.01023288, "balance_loss_clip": 1.04251266, "balance_loss_mlp": 1.01626062, "epoch": 0.8798172308062285, "flos": 28362427004160.0, "grad_norm": 1.6101021079444675, "language_loss": 0.85820913, "learning_rate": 1.494955118880048e-07, "loss": 0.87955177, "num_input_tokens_seen": 158383275, "step": 7317, "time_per_iteration": 2.621006488800049 }, { "auxiliary_loss_clip": 0.01148474, "auxiliary_loss_mlp": 0.01025003, "balance_loss_clip": 1.04508233, "balance_loss_mlp": 1.01808858, "epoch": 0.8799374736968677, "flos": 23988974751360.0, "grad_norm": 1.7523627014992615, "language_loss": 0.72563553, "learning_rate": 1.4920014744897634e-07, "loss": 0.74737024, "num_input_tokens_seen": 158402690, "step": 7318, "time_per_iteration": 2.509491205215454 }, { "auxiliary_loss_clip": 0.01126111, "auxiliary_loss_mlp": 0.01031995, "balance_loss_clip": 1.0434339, "balance_loss_mlp": 1.02503633, "epoch": 0.8800577165875068, "flos": 25630271832960.0, "grad_norm": 1.6560313201568533, "language_loss": 0.86032712, "learning_rate": 1.4890506377481392e-07, "loss": 0.88190824, "num_input_tokens_seen": 158421780, "step": 7319, "time_per_iteration": 2.562244176864624 }, { "auxiliary_loss_clip": 0.01086751, "auxiliary_loss_mlp": 0.01023624, "balance_loss_clip": 1.03943181, "balance_loss_mlp": 1.01763976, "epoch": 0.8801779594781458, "flos": 23440331439360.0, "grad_norm": 1.4042037413526858, "language_loss": 0.63861346, "learning_rate": 1.486102609102815e-07, "loss": 0.6597172, "num_input_tokens_seen": 158442330, "step": 7320, "time_per_iteration": 2.628953218460083 }, { "auxiliary_loss_clip": 0.01130155, "auxiliary_loss_mlp": 0.01026829, "balance_loss_clip": 1.04420698, "balance_loss_mlp": 1.02042103, "epoch": 0.880298202368785, "flos": 11508580656000.0, "grad_norm": 2.608853108470578, "language_loss": 0.8571862, "learning_rate": 1.483157389001004e-07, "loss": 0.87875605, "num_input_tokens_seen": 158459890, "step": 7321, "time_per_iteration": 2.49715256690979 }, { "auxiliary_loss_clip": 0.01133187, "auxiliary_loss_mlp": 0.01028905, "balance_loss_clip": 1.04163742, "balance_loss_mlp": 1.02169561, "epoch": 0.880418445259424, "flos": 22671447886080.0, "grad_norm": 4.746455685420052, "language_loss": 0.78898871, "learning_rate": 1.4802149778894933e-07, "loss": 0.81060958, "num_input_tokens_seen": 158478680, "step": 7322, "time_per_iteration": 2.5328519344329834 }, { "auxiliary_loss_clip": 0.01139603, "auxiliary_loss_mlp": 0.01024107, "balance_loss_clip": 1.0412612, "balance_loss_mlp": 1.01769984, "epoch": 0.8805386881500631, "flos": 20522158709760.0, "grad_norm": 1.8254619345139635, "language_loss": 0.87679756, "learning_rate": 1.4772753762146484e-07, "loss": 0.89843464, "num_input_tokens_seen": 158497935, "step": 7323, "time_per_iteration": 2.4870591163635254 }, { "auxiliary_loss_clip": 0.01141018, "auxiliary_loss_mlp": 0.01027423, "balance_loss_clip": 1.04275358, "balance_loss_mlp": 1.02038705, "epoch": 0.8806589310407023, "flos": 36538891620480.0, "grad_norm": 1.7526352216504806, "language_loss": 0.70505375, "learning_rate": 1.474338584422401e-07, "loss": 0.72673815, "num_input_tokens_seen": 158523145, "step": 7324, "time_per_iteration": 2.6090919971466064 }, { "auxiliary_loss_clip": 0.0114664, "auxiliary_loss_mlp": 0.01023081, "balance_loss_clip": 1.04683387, "balance_loss_mlp": 1.01624727, "epoch": 0.8807791739313413, "flos": 23440187784960.0, "grad_norm": 1.626497069638088, "language_loss": 0.75914007, "learning_rate": 1.4714046029582595e-07, "loss": 0.7808373, "num_input_tokens_seen": 158542210, "step": 7325, "time_per_iteration": 2.5231151580810547 }, { "auxiliary_loss_clip": 0.01123448, "auxiliary_loss_mlp": 0.0102056, "balance_loss_clip": 1.04172552, "balance_loss_mlp": 1.01374125, "epoch": 0.8808994168219804, "flos": 25956843310080.0, "grad_norm": 1.6735718278256764, "language_loss": 0.75846261, "learning_rate": 1.46847343226731e-07, "loss": 0.7799027, "num_input_tokens_seen": 158563250, "step": 7326, "time_per_iteration": 3.441106081008911 }, { "auxiliary_loss_clip": 0.01153632, "auxiliary_loss_mlp": 0.01031115, "balance_loss_clip": 1.04739606, "balance_loss_mlp": 1.02417111, "epoch": 0.8810196597126195, "flos": 17092079303040.0, "grad_norm": 1.8671867086991916, "language_loss": 0.69585323, "learning_rate": 1.465545072794203e-07, "loss": 0.71770072, "num_input_tokens_seen": 158581125, "step": 7327, "time_per_iteration": 2.4602608680725098 }, { "auxiliary_loss_clip": 0.01098591, "auxiliary_loss_mlp": 0.01023071, "balance_loss_clip": 1.04326129, "balance_loss_mlp": 1.01655972, "epoch": 0.8811399026032586, "flos": 23002831785600.0, "grad_norm": 1.5449566819734843, "language_loss": 0.75728607, "learning_rate": 1.4626195249831774e-07, "loss": 0.77850276, "num_input_tokens_seen": 158602025, "step": 7328, "time_per_iteration": 2.6025633811950684 }, { "auxiliary_loss_clip": 0.0114608, "auxiliary_loss_mlp": 0.01026724, "balance_loss_clip": 1.04378355, "balance_loss_mlp": 1.01986611, "epoch": 0.8812601454938976, "flos": 14463813242880.0, "grad_norm": 1.7666090464176314, "language_loss": 0.71985507, "learning_rate": 1.4596967892780244e-07, "loss": 0.74158311, "num_input_tokens_seen": 158618355, "step": 7329, "time_per_iteration": 2.4452786445617676 }, { "auxiliary_loss_clip": 0.01164007, "auxiliary_loss_mlp": 0.01028628, "balance_loss_clip": 1.0489583, "balance_loss_mlp": 1.02167773, "epoch": 0.8813803883845368, "flos": 22493223578880.0, "grad_norm": 1.656981427546696, "language_loss": 0.74887705, "learning_rate": 1.4567768661221314e-07, "loss": 0.77080339, "num_input_tokens_seen": 158638925, "step": 7330, "time_per_iteration": 2.4883689880371094 }, { "auxiliary_loss_clip": 0.01154075, "auxiliary_loss_mlp": 0.00760974, "balance_loss_clip": 1.04746878, "balance_loss_mlp": 1.00027955, "epoch": 0.8815006312751759, "flos": 21506901045120.0, "grad_norm": 1.8672902227794987, "language_loss": 0.74125326, "learning_rate": 1.4538597559584442e-07, "loss": 0.76040375, "num_input_tokens_seen": 158656715, "step": 7331, "time_per_iteration": 2.5114455223083496 }, { "auxiliary_loss_clip": 0.01131464, "auxiliary_loss_mlp": 0.01030845, "balance_loss_clip": 1.04435158, "balance_loss_mlp": 1.02398419, "epoch": 0.8816208741658149, "flos": 22784566792320.0, "grad_norm": 1.9192033410166822, "language_loss": 0.78914511, "learning_rate": 1.4509454592294823e-07, "loss": 0.81076819, "num_input_tokens_seen": 158677200, "step": 7332, "time_per_iteration": 2.535228729248047 }, { "auxiliary_loss_clip": 0.01121223, "auxiliary_loss_mlp": 0.00760688, "balance_loss_clip": 1.04388905, "balance_loss_mlp": 1.00031328, "epoch": 0.8817411170564541, "flos": 17779409026560.0, "grad_norm": 1.8987620234170015, "language_loss": 0.79005396, "learning_rate": 1.448033976377354e-07, "loss": 0.808873, "num_input_tokens_seen": 158692185, "step": 7333, "time_per_iteration": 2.526254653930664 }, { "auxiliary_loss_clip": 0.0114954, "auxiliary_loss_mlp": 0.01026404, "balance_loss_clip": 1.04388452, "balance_loss_mlp": 1.01962638, "epoch": 0.8818613599470931, "flos": 18551812112640.0, "grad_norm": 1.9585159626882462, "language_loss": 0.74066234, "learning_rate": 1.445125307843713e-07, "loss": 0.76242179, "num_input_tokens_seen": 158710410, "step": 7334, "time_per_iteration": 2.486567497253418 }, { "auxiliary_loss_clip": 0.01145798, "auxiliary_loss_mlp": 0.01026963, "balance_loss_clip": 1.04640961, "balance_loss_mlp": 1.02029884, "epoch": 0.8819816028377322, "flos": 27599792417280.0, "grad_norm": 1.6689588857647721, "language_loss": 0.75491405, "learning_rate": 1.442219454069813e-07, "loss": 0.77664173, "num_input_tokens_seen": 158731435, "step": 7335, "time_per_iteration": 2.539588451385498 }, { "auxiliary_loss_clip": 0.01110073, "auxiliary_loss_mlp": 0.01023698, "balance_loss_clip": 1.0416193, "balance_loss_mlp": 1.01728153, "epoch": 0.8821018457283714, "flos": 23404600385280.0, "grad_norm": 1.7777051803667316, "language_loss": 0.66295779, "learning_rate": 1.4393164154964676e-07, "loss": 0.68429554, "num_input_tokens_seen": 158750965, "step": 7336, "time_per_iteration": 2.5931131839752197 }, { "auxiliary_loss_clip": 0.01144744, "auxiliary_loss_mlp": 0.01024014, "balance_loss_clip": 1.04592383, "balance_loss_mlp": 1.01799345, "epoch": 0.8822220886190104, "flos": 29132459792640.0, "grad_norm": 1.6999958714610666, "language_loss": 0.93961334, "learning_rate": 1.4364161925640649e-07, "loss": 0.96130091, "num_input_tokens_seen": 158772365, "step": 7337, "time_per_iteration": 4.095608472824097 }, { "auxiliary_loss_clip": 0.01163246, "auxiliary_loss_mlp": 0.01020944, "balance_loss_clip": 1.0478276, "balance_loss_mlp": 1.01453948, "epoch": 0.8823423315096495, "flos": 20485422074880.0, "grad_norm": 1.8294444673109223, "language_loss": 0.85090828, "learning_rate": 1.4335187857125663e-07, "loss": 0.87275016, "num_input_tokens_seen": 158791065, "step": 7338, "time_per_iteration": 2.448507070541382 }, { "auxiliary_loss_clip": 0.01152192, "auxiliary_loss_mlp": 0.01024869, "balance_loss_clip": 1.04668427, "balance_loss_mlp": 1.01825547, "epoch": 0.8824625744002886, "flos": 24206377818240.0, "grad_norm": 1.7269263415583849, "language_loss": 0.75605804, "learning_rate": 1.4306241953815023e-07, "loss": 0.77782863, "num_input_tokens_seen": 158812125, "step": 7339, "time_per_iteration": 3.3029303550720215 }, { "auxiliary_loss_clip": 0.01148819, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.04492688, "balance_loss_mlp": 1.02243829, "epoch": 0.8825828172909277, "flos": 24679500785280.0, "grad_norm": 1.8557441176959373, "language_loss": 0.70574415, "learning_rate": 1.4277324220099862e-07, "loss": 0.72752905, "num_input_tokens_seen": 158834035, "step": 7340, "time_per_iteration": 2.541755199432373 }, { "auxiliary_loss_clip": 0.01114627, "auxiliary_loss_mlp": 0.0102882, "balance_loss_clip": 1.03919482, "balance_loss_mlp": 1.02201319, "epoch": 0.8827030601815667, "flos": 22456163721600.0, "grad_norm": 1.747398887515792, "language_loss": 0.73767161, "learning_rate": 1.4248434660366938e-07, "loss": 0.7591061, "num_input_tokens_seen": 158853510, "step": 7341, "time_per_iteration": 2.552534818649292 }, { "auxiliary_loss_clip": 0.01136443, "auxiliary_loss_mlp": 0.01025697, "balance_loss_clip": 1.04609585, "balance_loss_mlp": 1.01908731, "epoch": 0.8828233030722058, "flos": 19865639877120.0, "grad_norm": 1.7630697276135998, "language_loss": 0.70652723, "learning_rate": 1.4219573278998808e-07, "loss": 0.72814858, "num_input_tokens_seen": 158871970, "step": 7342, "time_per_iteration": 2.518187999725342 }, { "auxiliary_loss_clip": 0.01131631, "auxiliary_loss_mlp": 0.01028234, "balance_loss_clip": 1.04168737, "balance_loss_mlp": 1.02079821, "epoch": 0.882943545962845, "flos": 39347213581440.0, "grad_norm": 2.21319245036463, "language_loss": 0.6453948, "learning_rate": 1.4190740080373685e-07, "loss": 0.6669935, "num_input_tokens_seen": 158892250, "step": 7343, "time_per_iteration": 2.662992477416992 }, { "auxiliary_loss_clip": 0.01107075, "auxiliary_loss_mlp": 0.01027694, "balance_loss_clip": 1.04293728, "balance_loss_mlp": 1.02068162, "epoch": 0.883063788853484, "flos": 19054524908160.0, "grad_norm": 1.9128470448761485, "language_loss": 0.8428821, "learning_rate": 1.4161935068865538e-07, "loss": 0.8642298, "num_input_tokens_seen": 158907395, "step": 7344, "time_per_iteration": 2.5572757720947266 }, { "auxiliary_loss_clip": 0.01163098, "auxiliary_loss_mlp": 0.01023069, "balance_loss_clip": 1.04748154, "balance_loss_mlp": 1.01597643, "epoch": 0.8831840317441231, "flos": 18733196816640.0, "grad_norm": 1.9884806716925736, "language_loss": 0.75598997, "learning_rate": 1.4133158248844113e-07, "loss": 0.77785164, "num_input_tokens_seen": 158926300, "step": 7345, "time_per_iteration": 2.451802968978882 }, { "auxiliary_loss_clip": 0.01121531, "auxiliary_loss_mlp": 0.01026553, "balance_loss_clip": 1.04423833, "balance_loss_mlp": 1.0192039, "epoch": 0.8833042746347622, "flos": 26827712553600.0, "grad_norm": 1.9815866041175771, "language_loss": 0.7358411, "learning_rate": 1.4104409624674785e-07, "loss": 0.75732195, "num_input_tokens_seen": 158946085, "step": 7346, "time_per_iteration": 2.5765669345855713 }, { "auxiliary_loss_clip": 0.01150279, "auxiliary_loss_mlp": 0.01025849, "balance_loss_clip": 1.04803395, "balance_loss_mlp": 1.01897049, "epoch": 0.8834245175254013, "flos": 26104077158400.0, "grad_norm": 1.6847689682890183, "language_loss": 0.78214753, "learning_rate": 1.407568920071873e-07, "loss": 0.80390882, "num_input_tokens_seen": 158964950, "step": 7347, "time_per_iteration": 2.5381319522857666 }, { "auxiliary_loss_clip": 0.01167442, "auxiliary_loss_mlp": 0.01025769, "balance_loss_clip": 1.04882574, "balance_loss_mlp": 1.01834261, "epoch": 0.8835447604160404, "flos": 30629036977920.0, "grad_norm": 2.321426649769578, "language_loss": 0.68081951, "learning_rate": 1.4046996981332782e-07, "loss": 0.7027517, "num_input_tokens_seen": 158984835, "step": 7348, "time_per_iteration": 2.5215439796447754 }, { "auxiliary_loss_clip": 0.01121036, "auxiliary_loss_mlp": 0.01027281, "balance_loss_clip": 1.04358435, "balance_loss_mlp": 1.02011085, "epoch": 0.8836650033066795, "flos": 24718356322560.0, "grad_norm": 1.7675047709782992, "language_loss": 0.78017128, "learning_rate": 1.4018332970869516e-07, "loss": 0.80165446, "num_input_tokens_seen": 159002775, "step": 7349, "time_per_iteration": 2.583707571029663 }, { "auxiliary_loss_clip": 0.011261, "auxiliary_loss_mlp": 0.01025604, "balance_loss_clip": 1.04228497, "balance_loss_mlp": 1.01844525, "epoch": 0.8837852461973186, "flos": 25413371556480.0, "grad_norm": 1.9032862358436824, "language_loss": 0.84903371, "learning_rate": 1.398969717367733e-07, "loss": 0.87055075, "num_input_tokens_seen": 159024100, "step": 7350, "time_per_iteration": 2.5595128536224365 }, { "auxiliary_loss_clip": 0.01105732, "auxiliary_loss_mlp": 0.01026767, "balance_loss_clip": 1.04614639, "balance_loss_mlp": 1.02020407, "epoch": 0.8839054890879576, "flos": 17822574195840.0, "grad_norm": 1.6291694021849228, "language_loss": 0.7616117, "learning_rate": 1.396108959410014e-07, "loss": 0.78293669, "num_input_tokens_seen": 159043315, "step": 7351, "time_per_iteration": 2.562119960784912 }, { "auxiliary_loss_clip": 0.01147806, "auxiliary_loss_mlp": 0.00761121, "balance_loss_clip": 1.04664159, "balance_loss_mlp": 1.00028276, "epoch": 0.8840257319785968, "flos": 23769021818880.0, "grad_norm": 1.5707513263604735, "language_loss": 0.81177318, "learning_rate": 1.3932510236477745e-07, "loss": 0.83086246, "num_input_tokens_seen": 159063985, "step": 7352, "time_per_iteration": 3.251771926879883 }, { "auxiliary_loss_clip": 0.01147163, "auxiliary_loss_mlp": 0.0102789, "balance_loss_clip": 1.04332614, "balance_loss_mlp": 1.02079654, "epoch": 0.8841459748692359, "flos": 29059776622080.0, "grad_norm": 1.6565967636463592, "language_loss": 0.55732012, "learning_rate": 1.3903959105145636e-07, "loss": 0.57907069, "num_input_tokens_seen": 159084475, "step": 7353, "time_per_iteration": 2.550734281539917 }, { "auxiliary_loss_clip": 0.01163297, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.0483191, "balance_loss_mlp": 1.0230248, "epoch": 0.8842662177598749, "flos": 24311523905280.0, "grad_norm": 1.8964161696598811, "language_loss": 0.82971114, "learning_rate": 1.387543620443492e-07, "loss": 0.85164034, "num_input_tokens_seen": 159101320, "step": 7354, "time_per_iteration": 2.4645819664001465 }, { "auxiliary_loss_clip": 0.01161973, "auxiliary_loss_mlp": 0.01021499, "balance_loss_clip": 1.04799008, "balance_loss_mlp": 1.01476407, "epoch": 0.8843864606505141, "flos": 25007867942400.0, "grad_norm": 1.7537293080966447, "language_loss": 0.84263122, "learning_rate": 1.3846941538672606e-07, "loss": 0.86446595, "num_input_tokens_seen": 159120025, "step": 7355, "time_per_iteration": 2.4901413917541504 }, { "auxiliary_loss_clip": 0.01112876, "auxiliary_loss_mlp": 0.01029878, "balance_loss_clip": 1.04501057, "balance_loss_mlp": 1.0227139, "epoch": 0.8845067035411531, "flos": 28183915388160.0, "grad_norm": 2.353654000184152, "language_loss": 0.81194133, "learning_rate": 1.3818475112181193e-07, "loss": 0.83336884, "num_input_tokens_seen": 159138820, "step": 7356, "time_per_iteration": 2.646470308303833 }, { "auxiliary_loss_clip": 0.01131635, "auxiliary_loss_mlp": 0.01021357, "balance_loss_clip": 1.04525256, "balance_loss_mlp": 1.01523304, "epoch": 0.8846269464317922, "flos": 12853219311360.0, "grad_norm": 1.9190170150527706, "language_loss": 0.79512346, "learning_rate": 1.3790036929279091e-07, "loss": 0.81665337, "num_input_tokens_seen": 159155975, "step": 7357, "time_per_iteration": 2.5092339515686035 }, { "auxiliary_loss_clip": 0.01153483, "auxiliary_loss_mlp": 0.0076108, "balance_loss_clip": 1.04823232, "balance_loss_mlp": 1.00034904, "epoch": 0.8847471893224313, "flos": 18624351628800.0, "grad_norm": 2.103822450582153, "language_loss": 0.58460033, "learning_rate": 1.3761626994280363e-07, "loss": 0.60374594, "num_input_tokens_seen": 159173445, "step": 7358, "time_per_iteration": 2.480623722076416 }, { "auxiliary_loss_clip": 0.01123239, "auxiliary_loss_mlp": 0.01023023, "balance_loss_clip": 1.04128671, "balance_loss_mlp": 1.01642513, "epoch": 0.8848674322130704, "flos": 35769433449600.0, "grad_norm": 1.6916460702151148, "language_loss": 0.73470736, "learning_rate": 1.3733245311494735e-07, "loss": 0.75617003, "num_input_tokens_seen": 159196100, "step": 7359, "time_per_iteration": 2.6984171867370605 }, { "auxiliary_loss_clip": 0.01148963, "auxiliary_loss_mlp": 0.01025351, "balance_loss_clip": 1.04602134, "balance_loss_mlp": 1.01820421, "epoch": 0.8849876751037095, "flos": 24243760897920.0, "grad_norm": 2.4818108571349256, "language_loss": 0.7062704, "learning_rate": 1.3704891885227676e-07, "loss": 0.72801352, "num_input_tokens_seen": 159216145, "step": 7360, "time_per_iteration": 2.5198798179626465 }, { "auxiliary_loss_clip": 0.01118148, "auxiliary_loss_mlp": 0.01027542, "balance_loss_clip": 1.03866065, "balance_loss_mlp": 1.02041876, "epoch": 0.8851079179943486, "flos": 21500580251520.0, "grad_norm": 2.0336068332147015, "language_loss": 0.77887738, "learning_rate": 1.367656671978037e-07, "loss": 0.80033433, "num_input_tokens_seen": 159233610, "step": 7361, "time_per_iteration": 2.5557496547698975 }, { "auxiliary_loss_clip": 0.01138313, "auxiliary_loss_mlp": 0.01025114, "balance_loss_clip": 1.04252517, "balance_loss_mlp": 1.01867104, "epoch": 0.8852281608849877, "flos": 15300711198720.0, "grad_norm": 2.496801702282319, "language_loss": 0.73762441, "learning_rate": 1.36482698194498e-07, "loss": 0.75925869, "num_input_tokens_seen": 159250155, "step": 7362, "time_per_iteration": 3.2355852127075195 }, { "auxiliary_loss_clip": 0.01136212, "auxiliary_loss_mlp": 0.01028742, "balance_loss_clip": 1.04363954, "balance_loss_mlp": 1.02126729, "epoch": 0.8853484037756267, "flos": 23295719283840.0, "grad_norm": 1.7319929563162109, "language_loss": 0.71714276, "learning_rate": 1.3620001188528506e-07, "loss": 0.7387923, "num_input_tokens_seen": 159270875, "step": 7363, "time_per_iteration": 2.5511672496795654 }, { "auxiliary_loss_clip": 0.01150313, "auxiliary_loss_mlp": 0.01028741, "balance_loss_clip": 1.04491985, "balance_loss_mlp": 1.0216217, "epoch": 0.8854686466662659, "flos": 25114773795840.0, "grad_norm": 6.542907712405155, "language_loss": 0.7352742, "learning_rate": 1.3591760831304865e-07, "loss": 0.7570647, "num_input_tokens_seen": 159288565, "step": 7364, "time_per_iteration": 3.3268024921417236 }, { "auxiliary_loss_clip": 0.011618, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.04669142, "balance_loss_mlp": 1.02354312, "epoch": 0.885588889556905, "flos": 21390873137280.0, "grad_norm": 1.673553517308425, "language_loss": 0.79280424, "learning_rate": 1.356354875206287e-07, "loss": 0.8147319, "num_input_tokens_seen": 159306400, "step": 7365, "time_per_iteration": 3.187958240509033 }, { "auxiliary_loss_clip": 0.011205, "auxiliary_loss_mlp": 0.0102693, "balance_loss_clip": 1.04521346, "balance_loss_mlp": 1.01975632, "epoch": 0.885709132447544, "flos": 26906752431360.0, "grad_norm": 1.8966104663288512, "language_loss": 0.69828367, "learning_rate": 1.3535364955082296e-07, "loss": 0.71975797, "num_input_tokens_seen": 159326250, "step": 7366, "time_per_iteration": 2.598649740219116 }, { "auxiliary_loss_clip": 0.01160573, "auxiliary_loss_mlp": 0.01023878, "balance_loss_clip": 1.04672301, "balance_loss_mlp": 1.01757455, "epoch": 0.8858293753381832, "flos": 26103394800000.0, "grad_norm": 1.8056448074686748, "language_loss": 0.64303648, "learning_rate": 1.3507209444638613e-07, "loss": 0.66488099, "num_input_tokens_seen": 159348250, "step": 7367, "time_per_iteration": 2.504650354385376 }, { "auxiliary_loss_clip": 0.0114933, "auxiliary_loss_mlp": 0.010276, "balance_loss_clip": 1.04592288, "balance_loss_mlp": 1.02076888, "epoch": 0.8859496182288222, "flos": 23292810282240.0, "grad_norm": 1.8402854358864091, "language_loss": 0.73796588, "learning_rate": 1.347908222500298e-07, "loss": 0.75973523, "num_input_tokens_seen": 159368325, "step": 7368, "time_per_iteration": 2.4949305057525635 }, { "auxiliary_loss_clip": 0.01107113, "auxiliary_loss_mlp": 0.01021209, "balance_loss_clip": 1.04102755, "balance_loss_mlp": 1.01442909, "epoch": 0.8860698611194613, "flos": 16872916469760.0, "grad_norm": 1.8947453828312526, "language_loss": 0.69706559, "learning_rate": 1.3450983300442276e-07, "loss": 0.71834886, "num_input_tokens_seen": 159387555, "step": 7369, "time_per_iteration": 2.5385656356811523 }, { "auxiliary_loss_clip": 0.01153072, "auxiliary_loss_mlp": 0.01027591, "balance_loss_clip": 1.04765272, "balance_loss_mlp": 1.02086115, "epoch": 0.8861901040101005, "flos": 24681404206080.0, "grad_norm": 1.8269480298829808, "language_loss": 0.73396206, "learning_rate": 1.3422912675219068e-07, "loss": 0.75576872, "num_input_tokens_seen": 159407310, "step": 7370, "time_per_iteration": 2.53542160987854 }, { "auxiliary_loss_clip": 0.01159309, "auxiliary_loss_mlp": 0.0102542, "balance_loss_clip": 1.04656386, "balance_loss_mlp": 1.01877093, "epoch": 0.8863103469007395, "flos": 24423026699520.0, "grad_norm": 1.5070411842244116, "language_loss": 0.78914177, "learning_rate": 1.339487035359166e-07, "loss": 0.81098902, "num_input_tokens_seen": 159427680, "step": 7371, "time_per_iteration": 2.501157522201538 }, { "auxiliary_loss_clip": 0.01138981, "auxiliary_loss_mlp": 0.00759813, "balance_loss_clip": 1.04781425, "balance_loss_mlp": 1.00035965, "epoch": 0.8864305897913786, "flos": 22053964158720.0, "grad_norm": 1.5255304552770708, "language_loss": 0.84975827, "learning_rate": 1.336685633981409e-07, "loss": 0.86874622, "num_input_tokens_seen": 159448765, "step": 7372, "time_per_iteration": 2.552380323410034 }, { "auxiliary_loss_clip": 0.01148912, "auxiliary_loss_mlp": 0.01025314, "balance_loss_clip": 1.0439707, "balance_loss_mlp": 1.01869464, "epoch": 0.8865508326820177, "flos": 19099449843840.0, "grad_norm": 1.7667924984640728, "language_loss": 0.74797893, "learning_rate": 1.333887063813597e-07, "loss": 0.76972121, "num_input_tokens_seen": 159466870, "step": 7373, "time_per_iteration": 2.492511749267578 }, { "auxiliary_loss_clip": 0.01136233, "auxiliary_loss_mlp": 0.01025417, "balance_loss_clip": 1.04292011, "balance_loss_mlp": 1.01895869, "epoch": 0.8866710755726568, "flos": 15414189240960.0, "grad_norm": 1.8519643705507336, "language_loss": 0.66675442, "learning_rate": 1.331091325280278e-07, "loss": 0.68837094, "num_input_tokens_seen": 159485840, "step": 7374, "time_per_iteration": 2.510791301727295 }, { "auxiliary_loss_clip": 0.01100482, "auxiliary_loss_mlp": 0.01026336, "balance_loss_clip": 1.03991127, "balance_loss_mlp": 1.01876664, "epoch": 0.8867913184632958, "flos": 20083689388800.0, "grad_norm": 3.947824893526396, "language_loss": 0.78547907, "learning_rate": 1.3282984188055625e-07, "loss": 0.80674732, "num_input_tokens_seen": 159505630, "step": 7375, "time_per_iteration": 2.6000115871429443 }, { "auxiliary_loss_clip": 0.0116162, "auxiliary_loss_mlp": 0.01023174, "balance_loss_clip": 1.04643416, "balance_loss_mlp": 1.01603353, "epoch": 0.8869115613539349, "flos": 23365852588800.0, "grad_norm": 2.056981995635614, "language_loss": 0.79639435, "learning_rate": 1.3255083448131288e-07, "loss": 0.81824231, "num_input_tokens_seen": 159524675, "step": 7376, "time_per_iteration": 2.468514919281006 }, { "auxiliary_loss_clip": 0.01153826, "auxiliary_loss_mlp": 0.01023867, "balance_loss_clip": 1.04635096, "balance_loss_mlp": 1.0168097, "epoch": 0.8870318042445741, "flos": 21286840371840.0, "grad_norm": 2.051706950523332, "language_loss": 0.79144704, "learning_rate": 1.3227211037262365e-07, "loss": 0.81322396, "num_input_tokens_seen": 159541915, "step": 7377, "time_per_iteration": 2.4925196170806885 }, { "auxiliary_loss_clip": 0.01106324, "auxiliary_loss_mlp": 0.01026293, "balance_loss_clip": 1.0411036, "balance_loss_mlp": 1.0193013, "epoch": 0.8871520471352131, "flos": 20010862563840.0, "grad_norm": 2.2896742697620596, "language_loss": 0.85037845, "learning_rate": 1.319936695967696e-07, "loss": 0.87170464, "num_input_tokens_seen": 159559740, "step": 7378, "time_per_iteration": 3.3878891468048096 }, { "auxiliary_loss_clip": 0.01168705, "auxiliary_loss_mlp": 0.01025512, "balance_loss_clip": 1.04749191, "balance_loss_mlp": 1.01831126, "epoch": 0.8872722900258522, "flos": 22601422321920.0, "grad_norm": 2.28804989655411, "language_loss": 0.82376409, "learning_rate": 1.3171551219599097e-07, "loss": 0.84570622, "num_input_tokens_seen": 159578265, "step": 7379, "time_per_iteration": 2.465461254119873 }, { "auxiliary_loss_clip": 0.01163571, "auxiliary_loss_mlp": 0.01029357, "balance_loss_clip": 1.04819334, "balance_loss_mlp": 1.02224648, "epoch": 0.8873925329164913, "flos": 22163276223360.0, "grad_norm": 2.1278482347027547, "language_loss": 0.78018987, "learning_rate": 1.3143763821248377e-07, "loss": 0.80211914, "num_input_tokens_seen": 159595350, "step": 7380, "time_per_iteration": 2.46467661857605 }, { "auxiliary_loss_clip": 0.01161501, "auxiliary_loss_mlp": 0.01026119, "balance_loss_clip": 1.04731095, "balance_loss_mlp": 1.01956558, "epoch": 0.8875127758071304, "flos": 19208223204480.0, "grad_norm": 1.757366773108, "language_loss": 0.72181118, "learning_rate": 1.3116004768840118e-07, "loss": 0.74368739, "num_input_tokens_seen": 159613725, "step": 7381, "time_per_iteration": 2.4442265033721924 }, { "auxiliary_loss_clip": 0.01164293, "auxiliary_loss_mlp": 0.01030167, "balance_loss_clip": 1.04834008, "balance_loss_mlp": 1.02225423, "epoch": 0.8876330186977694, "flos": 18110900666880.0, "grad_norm": 1.5771448108851112, "language_loss": 0.74344814, "learning_rate": 1.3088274066585348e-07, "loss": 0.76539278, "num_input_tokens_seen": 159631335, "step": 7382, "time_per_iteration": 2.449061155319214 }, { "auxiliary_loss_clip": 0.01125708, "auxiliary_loss_mlp": 0.01024447, "balance_loss_clip": 1.04239285, "balance_loss_mlp": 1.01778913, "epoch": 0.8877532615884086, "flos": 22009434272640.0, "grad_norm": 2.0557404573500357, "language_loss": 0.90528738, "learning_rate": 1.3060571718690749e-07, "loss": 0.92678893, "num_input_tokens_seen": 159648830, "step": 7383, "time_per_iteration": 2.580018997192383 }, { "auxiliary_loss_clip": 0.0102685, "auxiliary_loss_mlp": 0.00751233, "balance_loss_clip": 1.0080682, "balance_loss_mlp": 1.00024748, "epoch": 0.8878735044790477, "flos": 72136924346880.0, "grad_norm": 0.746934924187684, "language_loss": 0.56938475, "learning_rate": 1.3032897729358805e-07, "loss": 0.58716553, "num_input_tokens_seen": 159709785, "step": 7384, "time_per_iteration": 3.155881643295288 }, { "auxiliary_loss_clip": 0.0108379, "auxiliary_loss_mlp": 0.00761577, "balance_loss_clip": 1.03728628, "balance_loss_mlp": 1.00031304, "epoch": 0.8879937473696867, "flos": 27526355061120.0, "grad_norm": 2.2727654404173516, "language_loss": 0.79572964, "learning_rate": 1.3005252102787645e-07, "loss": 0.81418329, "num_input_tokens_seen": 159728725, "step": 7385, "time_per_iteration": 2.6646227836608887 }, { "auxiliary_loss_clip": 0.01153129, "auxiliary_loss_mlp": 0.01027769, "balance_loss_clip": 1.04690695, "balance_loss_mlp": 1.02093256, "epoch": 0.8881139902603259, "flos": 22234091886720.0, "grad_norm": 1.5645379472624619, "language_loss": 0.73570222, "learning_rate": 1.297763484317105e-07, "loss": 0.7575112, "num_input_tokens_seen": 159747020, "step": 7386, "time_per_iteration": 2.49581241607666 }, { "auxiliary_loss_clip": 0.01099854, "auxiliary_loss_mlp": 0.00761581, "balance_loss_clip": 1.03918254, "balance_loss_mlp": 1.00029051, "epoch": 0.888234233150965, "flos": 20299548170880.0, "grad_norm": 2.0828391090877973, "language_loss": 0.70298672, "learning_rate": 1.2950045954698551e-07, "loss": 0.72160101, "num_input_tokens_seen": 159764855, "step": 7387, "time_per_iteration": 2.578559160232544 }, { "auxiliary_loss_clip": 0.01113617, "auxiliary_loss_mlp": 0.01024403, "balance_loss_clip": 1.04286075, "balance_loss_mlp": 1.01792097, "epoch": 0.888354476041604, "flos": 18147996437760.0, "grad_norm": 1.5841152538447005, "language_loss": 0.75465393, "learning_rate": 1.2922485441555343e-07, "loss": 0.77603412, "num_input_tokens_seen": 159783935, "step": 7388, "time_per_iteration": 3.3947603702545166 }, { "auxiliary_loss_clip": 0.0116153, "auxiliary_loss_mlp": 0.01024315, "balance_loss_clip": 1.04593158, "balance_loss_mlp": 1.01813388, "epoch": 0.8884747189322432, "flos": 22014282608640.0, "grad_norm": 1.722132608100089, "language_loss": 0.81813335, "learning_rate": 1.2894953307922363e-07, "loss": 0.83999181, "num_input_tokens_seen": 159802895, "step": 7389, "time_per_iteration": 2.4670357704162598 }, { "auxiliary_loss_clip": 0.01116183, "auxiliary_loss_mlp": 0.01026269, "balance_loss_clip": 1.04351723, "balance_loss_mlp": 1.01891077, "epoch": 0.8885949618228822, "flos": 19786779567360.0, "grad_norm": 1.904915795204902, "language_loss": 0.84140193, "learning_rate": 1.2867449557976208e-07, "loss": 0.86282647, "num_input_tokens_seen": 159820995, "step": 7390, "time_per_iteration": 3.308199882507324 }, { "auxiliary_loss_clip": 0.01148953, "auxiliary_loss_mlp": 0.01028342, "balance_loss_clip": 1.04743493, "balance_loss_mlp": 1.02181578, "epoch": 0.8887152047135213, "flos": 20047599198720.0, "grad_norm": 2.475454977518529, "language_loss": 0.75669563, "learning_rate": 1.283997419588916e-07, "loss": 0.77846861, "num_input_tokens_seen": 159840465, "step": 7391, "time_per_iteration": 3.155033588409424 }, { "auxiliary_loss_clip": 0.01152029, "auxiliary_loss_mlp": 0.01025831, "balance_loss_clip": 1.04579449, "balance_loss_mlp": 1.01880932, "epoch": 0.8888354476041604, "flos": 18588117784320.0, "grad_norm": 2.1358218219947385, "language_loss": 0.61874312, "learning_rate": 1.2812527225829216e-07, "loss": 0.64052176, "num_input_tokens_seen": 159858690, "step": 7392, "time_per_iteration": 2.4807190895080566 }, { "auxiliary_loss_clip": 0.01157475, "auxiliary_loss_mlp": 0.01027721, "balance_loss_clip": 1.04877675, "balance_loss_mlp": 1.02017844, "epoch": 0.8889556904947995, "flos": 21689794120320.0, "grad_norm": 1.8629029828097068, "language_loss": 0.76669598, "learning_rate": 1.2785108651960052e-07, "loss": 0.78854793, "num_input_tokens_seen": 159880325, "step": 7393, "time_per_iteration": 2.527873992919922 }, { "auxiliary_loss_clip": 0.01150541, "auxiliary_loss_mlp": 0.01025819, "balance_loss_clip": 1.04360723, "balance_loss_mlp": 1.01839566, "epoch": 0.8890759333854386, "flos": 27381204201600.0, "grad_norm": 1.9930803251801332, "language_loss": 0.80737621, "learning_rate": 1.2757718478441094e-07, "loss": 0.82913983, "num_input_tokens_seen": 159901070, "step": 7394, "time_per_iteration": 2.5457708835601807 }, { "auxiliary_loss_clip": 0.01132686, "auxiliary_loss_mlp": 0.01025926, "balance_loss_clip": 1.04221559, "balance_loss_mlp": 1.0190264, "epoch": 0.8891961762760777, "flos": 24498834353280.0, "grad_norm": 1.9109857340534795, "language_loss": 0.77172261, "learning_rate": 1.2730356709427302e-07, "loss": 0.79330873, "num_input_tokens_seen": 159919750, "step": 7395, "time_per_iteration": 2.554243803024292 }, { "auxiliary_loss_clip": 0.01146875, "auxiliary_loss_mlp": 0.01024404, "balance_loss_clip": 1.04758501, "balance_loss_mlp": 1.0178709, "epoch": 0.8893164191667168, "flos": 41499770895360.0, "grad_norm": 1.6241878572554849, "language_loss": 0.59897184, "learning_rate": 1.2703023349069542e-07, "loss": 0.62068462, "num_input_tokens_seen": 159944600, "step": 7396, "time_per_iteration": 2.6725103855133057 }, { "auxiliary_loss_clip": 0.01144284, "auxiliary_loss_mlp": 0.01024081, "balance_loss_clip": 1.04452491, "balance_loss_mlp": 1.01715159, "epoch": 0.8894366620573558, "flos": 33583623120000.0, "grad_norm": 2.0022390176584115, "language_loss": 0.61713123, "learning_rate": 1.2675718401514223e-07, "loss": 0.63881493, "num_input_tokens_seen": 159968780, "step": 7397, "time_per_iteration": 2.6006500720977783 }, { "auxiliary_loss_clip": 0.01133954, "auxiliary_loss_mlp": 0.01025974, "balance_loss_clip": 1.04388475, "balance_loss_mlp": 1.01896715, "epoch": 0.889556904947995, "flos": 16909832672640.0, "grad_norm": 3.7461030506979465, "language_loss": 0.74372351, "learning_rate": 1.264844187090346e-07, "loss": 0.7653228, "num_input_tokens_seen": 159985905, "step": 7398, "time_per_iteration": 2.512767791748047 }, { "auxiliary_loss_clip": 0.01129699, "auxiliary_loss_mlp": 0.01022794, "balance_loss_clip": 1.04250598, "balance_loss_mlp": 1.01593304, "epoch": 0.889677147838634, "flos": 26030855283840.0, "grad_norm": 1.7541093110588333, "language_loss": 0.75031447, "learning_rate": 1.262119376137516e-07, "loss": 0.77183938, "num_input_tokens_seen": 160006965, "step": 7399, "time_per_iteration": 2.558415412902832 }, { "auxiliary_loss_clip": 0.011371, "auxiliary_loss_mlp": 0.01028719, "balance_loss_clip": 1.0417968, "balance_loss_mlp": 1.02180469, "epoch": 0.8897973907292731, "flos": 26468283110400.0, "grad_norm": 1.4715787350143474, "language_loss": 0.85056758, "learning_rate": 1.2593974077062707e-07, "loss": 0.87222576, "num_input_tokens_seen": 160028585, "step": 7400, "time_per_iteration": 2.538681983947754 }, { "auxiliary_loss_clip": 0.0111146, "auxiliary_loss_mlp": 0.01030206, "balance_loss_clip": 1.04126358, "balance_loss_mlp": 1.02356339, "epoch": 0.8899176336199123, "flos": 26249694894720.0, "grad_norm": 1.9622732756288723, "language_loss": 0.63728619, "learning_rate": 1.2566782822095423e-07, "loss": 0.65870285, "num_input_tokens_seen": 160048840, "step": 7401, "time_per_iteration": 2.613985538482666 }, { "auxiliary_loss_clip": 0.01126955, "auxiliary_loss_mlp": 0.01028797, "balance_loss_clip": 1.04613304, "balance_loss_mlp": 1.02249694, "epoch": 0.8900378765105513, "flos": 20811742156800.0, "grad_norm": 2.1168286665464535, "language_loss": 0.71203077, "learning_rate": 1.2539620000598162e-07, "loss": 0.73358834, "num_input_tokens_seen": 160068175, "step": 7402, "time_per_iteration": 2.576333522796631 }, { "auxiliary_loss_clip": 0.01160417, "auxiliary_loss_mlp": 0.0102638, "balance_loss_clip": 1.04513204, "balance_loss_mlp": 1.01913214, "epoch": 0.8901581194011904, "flos": 16472333018880.0, "grad_norm": 1.8496561117810337, "language_loss": 0.79724431, "learning_rate": 1.2512485616691492e-07, "loss": 0.8191123, "num_input_tokens_seen": 160085230, "step": 7403, "time_per_iteration": 3.1410610675811768 }, { "auxiliary_loss_clip": 0.01123477, "auxiliary_loss_mlp": 0.01030844, "balance_loss_clip": 1.04315186, "balance_loss_mlp": 1.02355146, "epoch": 0.8902783622918296, "flos": 35155253773440.0, "grad_norm": 1.3794865600093456, "language_loss": 0.80994177, "learning_rate": 1.2485379674491681e-07, "loss": 0.83148497, "num_input_tokens_seen": 160111425, "step": 7404, "time_per_iteration": 2.705625534057617 }, { "auxiliary_loss_clip": 0.01136268, "auxiliary_loss_mlp": 0.01025623, "balance_loss_clip": 1.04631126, "balance_loss_mlp": 1.01862001, "epoch": 0.8903986051824686, "flos": 17201068145280.0, "grad_norm": 2.128251317112393, "language_loss": 0.7882039, "learning_rate": 1.2458302178110657e-07, "loss": 0.8098228, "num_input_tokens_seen": 160129790, "step": 7405, "time_per_iteration": 2.4934699535369873 }, { "auxiliary_loss_clip": 0.0111287, "auxiliary_loss_mlp": 0.01024317, "balance_loss_clip": 1.04147315, "balance_loss_mlp": 1.01801646, "epoch": 0.8905188480731077, "flos": 25483863997440.0, "grad_norm": 1.8294227201459328, "language_loss": 0.82171118, "learning_rate": 1.2431253131656118e-07, "loss": 0.84308302, "num_input_tokens_seen": 160149265, "step": 7406, "time_per_iteration": 2.585108995437622 }, { "auxiliary_loss_clip": 0.01129008, "auxiliary_loss_mlp": 0.01028282, "balance_loss_clip": 1.04535151, "balance_loss_mlp": 1.02130497, "epoch": 0.8906390909637467, "flos": 23365888502400.0, "grad_norm": 1.7925640152392257, "language_loss": 0.76818728, "learning_rate": 1.240423253923133e-07, "loss": 0.78976017, "num_input_tokens_seen": 160168870, "step": 7407, "time_per_iteration": 2.536597728729248 }, { "auxiliary_loss_clip": 0.01150407, "auxiliary_loss_mlp": 0.01025501, "balance_loss_clip": 1.04546547, "balance_loss_mlp": 1.01847994, "epoch": 0.8907593338543859, "flos": 21068790860160.0, "grad_norm": 2.20307130548792, "language_loss": 0.69325078, "learning_rate": 1.237724040493533e-07, "loss": 0.71500993, "num_input_tokens_seen": 160187495, "step": 7408, "time_per_iteration": 2.4833898544311523 }, { "auxiliary_loss_clip": 0.01167063, "auxiliary_loss_mlp": 0.01028702, "balance_loss_clip": 1.04895508, "balance_loss_mlp": 1.02105141, "epoch": 0.8908795767450249, "flos": 21869562712320.0, "grad_norm": 2.865595984824438, "language_loss": 0.72897494, "learning_rate": 1.2350276732862773e-07, "loss": 0.75093257, "num_input_tokens_seen": 160208520, "step": 7409, "time_per_iteration": 2.477720260620117 }, { "auxiliary_loss_clip": 0.01046172, "auxiliary_loss_mlp": 0.01003259, "balance_loss_clip": 1.00811172, "balance_loss_mlp": 1.00220954, "epoch": 0.890999819635664, "flos": 66307869348480.0, "grad_norm": 0.8373480587881046, "language_loss": 0.5672555, "learning_rate": 1.2323341527103993e-07, "loss": 0.58774984, "num_input_tokens_seen": 160263720, "step": 7410, "time_per_iteration": 3.007127285003662 }, { "auxiliary_loss_clip": 0.01163099, "auxiliary_loss_mlp": 0.01024288, "balance_loss_clip": 1.04729903, "balance_loss_mlp": 1.01757705, "epoch": 0.8911200625263032, "flos": 26869908055680.0, "grad_norm": 1.9842470986994716, "language_loss": 0.8505584, "learning_rate": 1.2296434791745135e-07, "loss": 0.87243229, "num_input_tokens_seen": 160282170, "step": 7411, "time_per_iteration": 2.496389865875244 }, { "auxiliary_loss_clip": 0.01150365, "auxiliary_loss_mlp": 0.01028823, "balance_loss_clip": 1.04617453, "balance_loss_mlp": 1.02178645, "epoch": 0.8912403054169422, "flos": 20885825957760.0, "grad_norm": 1.6041375959149036, "language_loss": 0.76385522, "learning_rate": 1.2269556530867875e-07, "loss": 0.78564703, "num_input_tokens_seen": 160300725, "step": 7412, "time_per_iteration": 2.478358030319214 }, { "auxiliary_loss_clip": 0.01168715, "auxiliary_loss_mlp": 0.01031622, "balance_loss_clip": 1.0491817, "balance_loss_mlp": 1.02406073, "epoch": 0.8913605483075813, "flos": 27016567286400.0, "grad_norm": 2.0281353569641163, "language_loss": 0.81472683, "learning_rate": 1.2242706748549614e-07, "loss": 0.83673024, "num_input_tokens_seen": 160318720, "step": 7413, "time_per_iteration": 2.4962806701660156 }, { "auxiliary_loss_clip": 0.01134037, "auxiliary_loss_mlp": 0.01023889, "balance_loss_clip": 1.04022205, "balance_loss_mlp": 1.01700449, "epoch": 0.8914807911982204, "flos": 23621500661760.0, "grad_norm": 2.6399933719858146, "language_loss": 0.82265896, "learning_rate": 1.2215885448863473e-07, "loss": 0.84423822, "num_input_tokens_seen": 160339595, "step": 7414, "time_per_iteration": 3.3771467208862305 }, { "auxiliary_loss_clip": 0.01134231, "auxiliary_loss_mlp": 0.01026934, "balance_loss_clip": 1.04533386, "balance_loss_mlp": 1.02024031, "epoch": 0.8916010340888595, "flos": 24462277286400.0, "grad_norm": 1.6638265038590265, "language_loss": 0.79944813, "learning_rate": 1.2189092635878152e-07, "loss": 0.8210597, "num_input_tokens_seen": 160361045, "step": 7415, "time_per_iteration": 3.3647654056549072 }, { "auxiliary_loss_clip": 0.01108925, "auxiliary_loss_mlp": 0.01024053, "balance_loss_clip": 1.04085612, "balance_loss_mlp": 1.01708865, "epoch": 0.8917212769794985, "flos": 21215773313280.0, "grad_norm": 1.7055034256665786, "language_loss": 0.77369088, "learning_rate": 1.216232831365822e-07, "loss": 0.7950207, "num_input_tokens_seen": 160379990, "step": 7416, "time_per_iteration": 2.5688788890838623 }, { "auxiliary_loss_clip": 0.0114272, "auxiliary_loss_mlp": 0.01032085, "balance_loss_clip": 1.04704022, "balance_loss_mlp": 1.02515602, "epoch": 0.8918415198701377, "flos": 25513992529920.0, "grad_norm": 2.008331427591492, "language_loss": 0.80988884, "learning_rate": 1.2135592486263678e-07, "loss": 0.83163685, "num_input_tokens_seen": 160399240, "step": 7417, "time_per_iteration": 2.5471713542938232 }, { "auxiliary_loss_clip": 0.01134281, "auxiliary_loss_mlp": 0.01025518, "balance_loss_clip": 1.04373598, "balance_loss_mlp": 1.01860356, "epoch": 0.8919617627607768, "flos": 37853006693760.0, "grad_norm": 1.5306463921455495, "language_loss": 0.6103965, "learning_rate": 1.2108885157750415e-07, "loss": 0.63199449, "num_input_tokens_seen": 160421600, "step": 7418, "time_per_iteration": 3.342547655105591 }, { "auxiliary_loss_clip": 0.01118443, "auxiliary_loss_mlp": 0.00760693, "balance_loss_clip": 1.04547417, "balance_loss_mlp": 1.00031996, "epoch": 0.8920820056514158, "flos": 26213676531840.0, "grad_norm": 1.6823357497303655, "language_loss": 0.80433202, "learning_rate": 1.2082206332169897e-07, "loss": 0.8231234, "num_input_tokens_seen": 160441695, "step": 7419, "time_per_iteration": 2.6026644706726074 }, { "auxiliary_loss_clip": 0.01132414, "auxiliary_loss_mlp": 0.01024381, "balance_loss_clip": 1.04629779, "balance_loss_mlp": 1.01738334, "epoch": 0.892202248542055, "flos": 17383135207680.0, "grad_norm": 2.528356360430867, "language_loss": 0.73176026, "learning_rate": 1.2055556013569225e-07, "loss": 0.7533282, "num_input_tokens_seen": 160457205, "step": 7420, "time_per_iteration": 2.526818037033081 }, { "auxiliary_loss_clip": 0.01135826, "auxiliary_loss_mlp": 0.0102825, "balance_loss_clip": 1.04561555, "balance_loss_mlp": 1.02167892, "epoch": 0.892322491432694, "flos": 21324223451520.0, "grad_norm": 1.5949630715450362, "language_loss": 0.82172787, "learning_rate": 1.2028934205991315e-07, "loss": 0.84336865, "num_input_tokens_seen": 160476525, "step": 7421, "time_per_iteration": 2.523819923400879 }, { "auxiliary_loss_clip": 0.01147816, "auxiliary_loss_mlp": 0.01026366, "balance_loss_clip": 1.04353619, "balance_loss_mlp": 1.0192436, "epoch": 0.8924427343233331, "flos": 24029374573440.0, "grad_norm": 1.3436758099633799, "language_loss": 0.76613557, "learning_rate": 1.2002340913474607e-07, "loss": 0.78787744, "num_input_tokens_seen": 160500160, "step": 7422, "time_per_iteration": 2.5745363235473633 }, { "auxiliary_loss_clip": 0.01166622, "auxiliary_loss_mlp": 0.01025032, "balance_loss_clip": 1.04903519, "balance_loss_mlp": 1.01762319, "epoch": 0.8925629772139723, "flos": 30008069631360.0, "grad_norm": 4.816366655085421, "language_loss": 0.74097252, "learning_rate": 1.1975776140053317e-07, "loss": 0.76288909, "num_input_tokens_seen": 160520130, "step": 7423, "time_per_iteration": 2.5222017765045166 }, { "auxiliary_loss_clip": 0.01109818, "auxiliary_loss_mlp": 0.01027586, "balance_loss_clip": 1.04213154, "balance_loss_mlp": 1.02021313, "epoch": 0.8926832201046113, "flos": 22601709630720.0, "grad_norm": 2.0255575629179003, "language_loss": 0.73527914, "learning_rate": 1.194923988975729e-07, "loss": 0.75665319, "num_input_tokens_seen": 160539730, "step": 7424, "time_per_iteration": 2.597662925720215 }, { "auxiliary_loss_clip": 0.0111811, "auxiliary_loss_mlp": 0.0102545, "balance_loss_clip": 1.04261017, "balance_loss_mlp": 1.01762354, "epoch": 0.8928034629952504, "flos": 13297722117120.0, "grad_norm": 2.1316676612759204, "language_loss": 0.73224986, "learning_rate": 1.192273216661206e-07, "loss": 0.75368547, "num_input_tokens_seen": 160557820, "step": 7425, "time_per_iteration": 2.512511730194092 }, { "auxiliary_loss_clip": 0.01003574, "auxiliary_loss_mlp": 0.0100238, "balance_loss_clip": 1.00683701, "balance_loss_mlp": 1.00131309, "epoch": 0.8929237058858895, "flos": 54854556744960.0, "grad_norm": 0.7688657710720881, "language_loss": 0.57465923, "learning_rate": 1.189625297463881e-07, "loss": 0.59471875, "num_input_tokens_seen": 160619510, "step": 7426, "time_per_iteration": 3.160731315612793 }, { "auxiliary_loss_clip": 0.01084909, "auxiliary_loss_mlp": 0.01020986, "balance_loss_clip": 1.03409195, "balance_loss_mlp": 1.01408708, "epoch": 0.8930439487765286, "flos": 28883850785280.0, "grad_norm": 1.5367520762357738, "language_loss": 0.79482758, "learning_rate": 1.1869802317854394e-07, "loss": 0.81588656, "num_input_tokens_seen": 160643295, "step": 7427, "time_per_iteration": 2.7322096824645996 }, { "auxiliary_loss_clip": 0.01113607, "auxiliary_loss_mlp": 0.01023409, "balance_loss_clip": 1.04364181, "balance_loss_mlp": 1.01615262, "epoch": 0.8931641916671677, "flos": 22419283432320.0, "grad_norm": 1.7343995445743285, "language_loss": 0.72074425, "learning_rate": 1.1843380200271425e-07, "loss": 0.74211448, "num_input_tokens_seen": 160662495, "step": 7428, "time_per_iteration": 2.5945115089416504 }, { "auxiliary_loss_clip": 0.01113214, "auxiliary_loss_mlp": 0.01026338, "balance_loss_clip": 1.04279768, "balance_loss_mlp": 1.01928401, "epoch": 0.8932844345578068, "flos": 25843149786240.0, "grad_norm": 1.6787148635646005, "language_loss": 0.80171049, "learning_rate": 1.181698662589805e-07, "loss": 0.82310599, "num_input_tokens_seen": 160682080, "step": 7429, "time_per_iteration": 3.3840930461883545 }, { "auxiliary_loss_clip": 0.01147868, "auxiliary_loss_mlp": 0.01022089, "balance_loss_clip": 1.04457545, "balance_loss_mlp": 1.01528823, "epoch": 0.8934046774484459, "flos": 22925803069440.0, "grad_norm": 1.7884042904628068, "language_loss": 0.76093256, "learning_rate": 1.1790621598738249e-07, "loss": 0.78263211, "num_input_tokens_seen": 160700395, "step": 7430, "time_per_iteration": 2.497098684310913 }, { "auxiliary_loss_clip": 0.0116352, "auxiliary_loss_mlp": 0.01019771, "balance_loss_clip": 1.05053544, "balance_loss_mlp": 1.01339912, "epoch": 0.8935249203390849, "flos": 24462097718400.0, "grad_norm": 1.825975352902499, "language_loss": 0.74477571, "learning_rate": 1.1764285122791461e-07, "loss": 0.76660866, "num_input_tokens_seen": 160721115, "step": 7431, "time_per_iteration": 2.509669065475464 }, { "auxiliary_loss_clip": 0.01148967, "auxiliary_loss_mlp": 0.01024102, "balance_loss_clip": 1.04490519, "balance_loss_mlp": 1.01721764, "epoch": 0.8936451632297241, "flos": 15742735966080.0, "grad_norm": 1.7320821003056215, "language_loss": 0.76656246, "learning_rate": 1.173797720205294e-07, "loss": 0.78829312, "num_input_tokens_seen": 160739150, "step": 7432, "time_per_iteration": 2.466038465499878 }, { "auxiliary_loss_clip": 0.01149589, "auxiliary_loss_mlp": 0.01024683, "balance_loss_clip": 1.04573584, "balance_loss_mlp": 1.01737273, "epoch": 0.8937654061203631, "flos": 35115500396160.0, "grad_norm": 2.102042160492293, "language_loss": 0.7177195, "learning_rate": 1.1711697840513602e-07, "loss": 0.7394622, "num_input_tokens_seen": 160758585, "step": 7433, "time_per_iteration": 2.6021134853363037 }, { "auxiliary_loss_clip": 0.01141241, "auxiliary_loss_mlp": 0.01029486, "balance_loss_clip": 1.04322314, "balance_loss_mlp": 1.02330542, "epoch": 0.8938856490110022, "flos": 16107444708480.0, "grad_norm": 1.9450508951035508, "language_loss": 0.70586014, "learning_rate": 1.1685447042160012e-07, "loss": 0.72756743, "num_input_tokens_seen": 160776620, "step": 7434, "time_per_iteration": 2.4583957195281982 }, { "auxiliary_loss_clip": 0.01165408, "auxiliary_loss_mlp": 0.0102299, "balance_loss_clip": 1.04764819, "balance_loss_mlp": 1.01574779, "epoch": 0.8940058919016414, "flos": 20704189858560.0, "grad_norm": 1.588797043846843, "language_loss": 0.71360916, "learning_rate": 1.1659224810974367e-07, "loss": 0.73549312, "num_input_tokens_seen": 160796580, "step": 7435, "time_per_iteration": 2.4584853649139404 }, { "auxiliary_loss_clip": 0.01136, "auxiliary_loss_mlp": 0.01031563, "balance_loss_clip": 1.04699516, "balance_loss_mlp": 1.02447927, "epoch": 0.8941261347922804, "flos": 25229041937280.0, "grad_norm": 2.909459196440415, "language_loss": 0.68370497, "learning_rate": 1.1633031150934591e-07, "loss": 0.70538062, "num_input_tokens_seen": 160819610, "step": 7436, "time_per_iteration": 2.605761766433716 }, { "auxiliary_loss_clip": 0.01153984, "auxiliary_loss_mlp": 0.01038199, "balance_loss_clip": 1.04989719, "balance_loss_mlp": 1.03088593, "epoch": 0.8942463776829195, "flos": 19537236806400.0, "grad_norm": 1.8993624182873627, "language_loss": 0.79636002, "learning_rate": 1.1606866066014176e-07, "loss": 0.81828189, "num_input_tokens_seen": 160838660, "step": 7437, "time_per_iteration": 2.4946765899658203 }, { "auxiliary_loss_clip": 0.01122198, "auxiliary_loss_mlp": 0.01028045, "balance_loss_clip": 1.04480362, "balance_loss_mlp": 1.02105618, "epoch": 0.8943666205735585, "flos": 22301567585280.0, "grad_norm": 2.2158719709381343, "language_loss": 0.75341177, "learning_rate": 1.1580729560182434e-07, "loss": 0.77491421, "num_input_tokens_seen": 160854515, "step": 7438, "time_per_iteration": 2.52472186088562 }, { "auxiliary_loss_clip": 0.01162116, "auxiliary_loss_mlp": 0.00760191, "balance_loss_clip": 1.0468812, "balance_loss_mlp": 1.00031042, "epoch": 0.8944868634641977, "flos": 18912893581440.0, "grad_norm": 1.6757554611644978, "language_loss": 0.70859236, "learning_rate": 1.1554621637404171e-07, "loss": 0.72781545, "num_input_tokens_seen": 160872605, "step": 7439, "time_per_iteration": 2.481964588165283 }, { "auxiliary_loss_clip": 0.01150441, "auxiliary_loss_mlp": 0.01023019, "balance_loss_clip": 1.04643595, "balance_loss_mlp": 1.01635528, "epoch": 0.8946071063548368, "flos": 14460904241280.0, "grad_norm": 2.112028418107696, "language_loss": 0.6068247, "learning_rate": 1.1528542301639999e-07, "loss": 0.62855923, "num_input_tokens_seen": 160889395, "step": 7440, "time_per_iteration": 3.166775941848755 }, { "auxiliary_loss_clip": 0.01121998, "auxiliary_loss_mlp": 0.01026113, "balance_loss_clip": 1.0417397, "balance_loss_mlp": 1.01856959, "epoch": 0.8947273492454758, "flos": 20084084438400.0, "grad_norm": 2.1974336730111164, "language_loss": 0.82231319, "learning_rate": 1.1502491556846105e-07, "loss": 0.84379423, "num_input_tokens_seen": 160907890, "step": 7441, "time_per_iteration": 2.5549275875091553 }, { "auxiliary_loss_clip": 0.01136374, "auxiliary_loss_mlp": 0.01023903, "balance_loss_clip": 1.04626226, "balance_loss_mlp": 1.01727486, "epoch": 0.894847592136115, "flos": 18550555136640.0, "grad_norm": 2.1440468063828164, "language_loss": 0.81231463, "learning_rate": 1.1476469406974331e-07, "loss": 0.83391738, "num_input_tokens_seen": 160923490, "step": 7442, "time_per_iteration": 3.264305591583252 }, { "auxiliary_loss_clip": 0.01160518, "auxiliary_loss_mlp": 0.0102365, "balance_loss_clip": 1.04757857, "balance_loss_mlp": 1.01681352, "epoch": 0.894967835026754, "flos": 23478468704640.0, "grad_norm": 1.5551516823389329, "language_loss": 0.76947987, "learning_rate": 1.1450475855972341e-07, "loss": 0.79132152, "num_input_tokens_seen": 160944280, "step": 7443, "time_per_iteration": 3.2135822772979736 }, { "auxiliary_loss_clip": 0.01131981, "auxiliary_loss_mlp": 0.00760756, "balance_loss_clip": 1.04137886, "balance_loss_mlp": 1.00028944, "epoch": 0.8950880779173931, "flos": 15188310564480.0, "grad_norm": 1.8687978739229985, "language_loss": 0.7036041, "learning_rate": 1.1424510907783158e-07, "loss": 0.72253144, "num_input_tokens_seen": 160961560, "step": 7444, "time_per_iteration": 2.5024240016937256 }, { "auxiliary_loss_clip": 0.01139113, "auxiliary_loss_mlp": 0.01022613, "balance_loss_clip": 1.04281092, "balance_loss_mlp": 1.0168221, "epoch": 0.8952083208080323, "flos": 22091957769600.0, "grad_norm": 1.6666500281271945, "language_loss": 0.82345879, "learning_rate": 1.1398574566345787e-07, "loss": 0.84507602, "num_input_tokens_seen": 160982195, "step": 7445, "time_per_iteration": 2.539337158203125 }, { "auxiliary_loss_clip": 0.01140588, "auxiliary_loss_mlp": 0.01026101, "balance_loss_clip": 1.04338813, "balance_loss_mlp": 1.01895475, "epoch": 0.8953285636986713, "flos": 23254026572160.0, "grad_norm": 2.078158186959809, "language_loss": 0.8251037, "learning_rate": 1.1372666835594702e-07, "loss": 0.84677064, "num_input_tokens_seen": 161000520, "step": 7446, "time_per_iteration": 2.5285720825195312 }, { "auxiliary_loss_clip": 0.01133592, "auxiliary_loss_mlp": 0.01020942, "balance_loss_clip": 1.0455569, "balance_loss_mlp": 1.01389694, "epoch": 0.8954488065893104, "flos": 16362661818240.0, "grad_norm": 1.8030804772716174, "language_loss": 0.72040135, "learning_rate": 1.1346787719460071e-07, "loss": 0.7419467, "num_input_tokens_seen": 161019405, "step": 7447, "time_per_iteration": 2.4996230602264404 }, { "auxiliary_loss_clip": 0.01133577, "auxiliary_loss_mlp": 0.01020934, "balance_loss_clip": 1.04350352, "balance_loss_mlp": 1.01446056, "epoch": 0.8955690494799495, "flos": 18257883120000.0, "grad_norm": 1.72502686710768, "language_loss": 0.72023404, "learning_rate": 1.1320937221867732e-07, "loss": 0.74177915, "num_input_tokens_seen": 161036985, "step": 7448, "time_per_iteration": 2.5138773918151855 }, { "auxiliary_loss_clip": 0.01131618, "auxiliary_loss_mlp": 0.01023663, "balance_loss_clip": 1.04153264, "balance_loss_mlp": 1.01722622, "epoch": 0.8956892923705886, "flos": 25447486498560.0, "grad_norm": 1.7951921376168438, "language_loss": 0.79578722, "learning_rate": 1.1295115346739192e-07, "loss": 0.81734002, "num_input_tokens_seen": 161056985, "step": 7449, "time_per_iteration": 2.5525810718536377 }, { "auxiliary_loss_clip": 0.01138899, "auxiliary_loss_mlp": 0.01026463, "balance_loss_clip": 1.04548371, "balance_loss_mlp": 1.01937556, "epoch": 0.8958095352612276, "flos": 52661883939840.0, "grad_norm": 2.4013256355457884, "language_loss": 0.7294091, "learning_rate": 1.1269322097991629e-07, "loss": 0.75106269, "num_input_tokens_seen": 161080270, "step": 7450, "time_per_iteration": 2.8006279468536377 }, { "auxiliary_loss_clip": 0.0115476, "auxiliary_loss_mlp": 0.01028667, "balance_loss_clip": 1.04804885, "balance_loss_mlp": 1.02125812, "epoch": 0.8959297781518668, "flos": 23186335392000.0, "grad_norm": 2.644634662661846, "language_loss": 0.67502606, "learning_rate": 1.1243557479537846e-07, "loss": 0.69686037, "num_input_tokens_seen": 161100160, "step": 7451, "time_per_iteration": 2.4947285652160645 }, { "auxiliary_loss_clip": 0.01163043, "auxiliary_loss_mlp": 0.01027613, "balance_loss_clip": 1.04613662, "balance_loss_mlp": 1.02032375, "epoch": 0.8960500210425059, "flos": 20334309557760.0, "grad_norm": 2.2007025409116805, "language_loss": 0.68958431, "learning_rate": 1.121782149528634e-07, "loss": 0.71149093, "num_input_tokens_seen": 161117260, "step": 7452, "time_per_iteration": 2.453657865524292 }, { "auxiliary_loss_clip": 0.01135394, "auxiliary_loss_mlp": 0.01030421, "balance_loss_clip": 1.04743433, "balance_loss_mlp": 1.02423954, "epoch": 0.8961702639331449, "flos": 19901694153600.0, "grad_norm": 1.8170416858581582, "language_loss": 0.78708577, "learning_rate": 1.1192114149141208e-07, "loss": 0.80874395, "num_input_tokens_seen": 161136895, "step": 7453, "time_per_iteration": 2.523460865020752 }, { "auxiliary_loss_clip": 0.01138004, "auxiliary_loss_mlp": 0.01027343, "balance_loss_clip": 1.04257989, "balance_loss_mlp": 1.01942205, "epoch": 0.8962905068237841, "flos": 12896348567040.0, "grad_norm": 2.1331954849948733, "language_loss": 0.6549015, "learning_rate": 1.1166435445002197e-07, "loss": 0.67655498, "num_input_tokens_seen": 161154565, "step": 7454, "time_per_iteration": 2.5036587715148926 }, { "auxiliary_loss_clip": 0.0115244, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.04877079, "balance_loss_mlp": 1.01988792, "epoch": 0.8964107497144231, "flos": 23440331439360.0, "grad_norm": 1.9806021595522296, "language_loss": 0.68371624, "learning_rate": 1.1140785386764818e-07, "loss": 0.7055099, "num_input_tokens_seen": 161173265, "step": 7455, "time_per_iteration": 3.2580881118774414 }, { "auxiliary_loss_clip": 0.01142439, "auxiliary_loss_mlp": 0.01026517, "balance_loss_clip": 1.04297376, "balance_loss_mlp": 1.01957917, "epoch": 0.8965309926050622, "flos": 19500176949120.0, "grad_norm": 1.8482093808832363, "language_loss": 0.69741678, "learning_rate": 1.1115163978320153e-07, "loss": 0.71910632, "num_input_tokens_seen": 161191995, "step": 7456, "time_per_iteration": 2.4956843852996826 }, { "auxiliary_loss_clip": 0.01154136, "auxiliary_loss_mlp": 0.00760955, "balance_loss_clip": 1.04702401, "balance_loss_mlp": 1.00029397, "epoch": 0.8966512354957014, "flos": 28658008022400.0, "grad_norm": 2.1507491962526046, "language_loss": 0.82433355, "learning_rate": 1.1089571223554917e-07, "loss": 0.84348446, "num_input_tokens_seen": 161212880, "step": 7457, "time_per_iteration": 2.5518147945404053 }, { "auxiliary_loss_clip": 0.011488, "auxiliary_loss_mlp": 0.01025729, "balance_loss_clip": 1.04337263, "balance_loss_mlp": 1.0188266, "epoch": 0.8967714783863404, "flos": 23370916406400.0, "grad_norm": 1.6205393697015267, "language_loss": 0.85538733, "learning_rate": 1.1064007126351537e-07, "loss": 0.87713265, "num_input_tokens_seen": 161233595, "step": 7458, "time_per_iteration": 2.551846981048584 }, { "auxiliary_loss_clip": 0.01131153, "auxiliary_loss_mlp": 0.01025815, "balance_loss_clip": 1.04457664, "balance_loss_mlp": 1.01857305, "epoch": 0.8968917212769795, "flos": 24535175938560.0, "grad_norm": 2.2252109249181147, "language_loss": 0.76584154, "learning_rate": 1.1038471690588003e-07, "loss": 0.78741121, "num_input_tokens_seen": 161252740, "step": 7459, "time_per_iteration": 2.5679402351379395 }, { "auxiliary_loss_clip": 0.01106162, "auxiliary_loss_mlp": 0.01023798, "balance_loss_clip": 1.04195023, "balance_loss_mlp": 1.0170958, "epoch": 0.8970119641676186, "flos": 23475416048640.0, "grad_norm": 1.8712703579686236, "language_loss": 0.79938948, "learning_rate": 1.1012964920138145e-07, "loss": 0.82068908, "num_input_tokens_seen": 161272325, "step": 7460, "time_per_iteration": 2.611543893814087 }, { "auxiliary_loss_clip": 0.01129379, "auxiliary_loss_mlp": 0.01025774, "balance_loss_clip": 1.04149199, "balance_loss_mlp": 1.01869869, "epoch": 0.8971322070582577, "flos": 24538192680960.0, "grad_norm": 1.6115542468821047, "language_loss": 0.75844646, "learning_rate": 1.0987486818871205e-07, "loss": 0.779998, "num_input_tokens_seen": 161295915, "step": 7461, "time_per_iteration": 2.5886144638061523 }, { "auxiliary_loss_clip": 0.01148992, "auxiliary_loss_mlp": 0.00761069, "balance_loss_clip": 1.04524672, "balance_loss_mlp": 1.00036502, "epoch": 0.8972524499488967, "flos": 21797454159360.0, "grad_norm": 2.1668010811060694, "language_loss": 0.73092866, "learning_rate": 1.0962037390652245e-07, "loss": 0.75002927, "num_input_tokens_seen": 161314935, "step": 7462, "time_per_iteration": 2.4966514110565186 }, { "auxiliary_loss_clip": 0.01132905, "auxiliary_loss_mlp": 0.01028569, "balance_loss_clip": 1.04324603, "balance_loss_mlp": 1.0218513, "epoch": 0.8973726928395359, "flos": 21726243446400.0, "grad_norm": 1.6758241947683257, "language_loss": 0.71576154, "learning_rate": 1.0936616639341911e-07, "loss": 0.73737621, "num_input_tokens_seen": 161335225, "step": 7463, "time_per_iteration": 2.5411312580108643 }, { "auxiliary_loss_clip": 0.0103955, "auxiliary_loss_mlp": 0.01002419, "balance_loss_clip": 1.00897431, "balance_loss_mlp": 1.00139928, "epoch": 0.897492935730175, "flos": 53837100097920.0, "grad_norm": 0.8129002436099186, "language_loss": 0.54798645, "learning_rate": 1.0911224568796473e-07, "loss": 0.56840611, "num_input_tokens_seen": 161393420, "step": 7464, "time_per_iteration": 3.110926866531372 }, { "auxiliary_loss_clip": 0.01146853, "auxiliary_loss_mlp": 0.01023963, "balance_loss_clip": 1.0460999, "balance_loss_mlp": 1.01699495, "epoch": 0.897613178620814, "flos": 18290346036480.0, "grad_norm": 2.119088007614086, "language_loss": 0.70793706, "learning_rate": 1.0885861182867984e-07, "loss": 0.72964525, "num_input_tokens_seen": 161411525, "step": 7465, "time_per_iteration": 2.475480079650879 }, { "auxiliary_loss_clip": 0.0113641, "auxiliary_loss_mlp": 0.01027551, "balance_loss_clip": 1.04358995, "balance_loss_mlp": 1.02103615, "epoch": 0.8977334215114532, "flos": 32993718059520.0, "grad_norm": 13.499940689087065, "language_loss": 0.70925289, "learning_rate": 1.0860526485403942e-07, "loss": 0.73089254, "num_input_tokens_seen": 161432800, "step": 7466, "time_per_iteration": 3.430245876312256 }, { "auxiliary_loss_clip": 0.01162232, "auxiliary_loss_mlp": 0.01023247, "balance_loss_clip": 1.04652929, "balance_loss_mlp": 1.01653266, "epoch": 0.8978536644020922, "flos": 15195636938880.0, "grad_norm": 1.5223614169245245, "language_loss": 0.76937968, "learning_rate": 1.0835220480247675e-07, "loss": 0.79123449, "num_input_tokens_seen": 161451295, "step": 7467, "time_per_iteration": 2.4269371032714844 }, { "auxiliary_loss_clip": 0.01132855, "auxiliary_loss_mlp": 0.01028946, "balance_loss_clip": 1.04478419, "balance_loss_mlp": 1.02231479, "epoch": 0.8979739072927313, "flos": 18004389863040.0, "grad_norm": 2.0641071025158992, "language_loss": 0.83205509, "learning_rate": 1.0809943171238067e-07, "loss": 0.8536731, "num_input_tokens_seen": 161469220, "step": 7468, "time_per_iteration": 3.280802011489868 }, { "auxiliary_loss_clip": 0.01142962, "auxiliary_loss_mlp": 0.01030336, "balance_loss_clip": 1.04590416, "balance_loss_mlp": 1.02255774, "epoch": 0.8980941501833704, "flos": 22271546793600.0, "grad_norm": 3.0133644366609564, "language_loss": 0.63119388, "learning_rate": 1.078469456220965e-07, "loss": 0.65292692, "num_input_tokens_seen": 161489375, "step": 7469, "time_per_iteration": 3.2443292140960693 }, { "auxiliary_loss_clip": 0.01146077, "auxiliary_loss_mlp": 0.01023525, "balance_loss_clip": 1.04392767, "balance_loss_mlp": 1.01671243, "epoch": 0.8982143930740095, "flos": 37560729726720.0, "grad_norm": 1.7322224856893929, "language_loss": 0.69429421, "learning_rate": 1.0759474656992606e-07, "loss": 0.71599025, "num_input_tokens_seen": 161512145, "step": 7470, "time_per_iteration": 2.6585495471954346 }, { "auxiliary_loss_clip": 0.0113698, "auxiliary_loss_mlp": 0.01024885, "balance_loss_clip": 1.04101002, "balance_loss_mlp": 1.01792657, "epoch": 0.8983346359646486, "flos": 18076893465600.0, "grad_norm": 3.114738911094555, "language_loss": 0.78126603, "learning_rate": 1.0734283459412785e-07, "loss": 0.8028847, "num_input_tokens_seen": 161528995, "step": 7471, "time_per_iteration": 2.497157096862793 }, { "auxiliary_loss_clip": 0.01110702, "auxiliary_loss_mlp": 0.01028791, "balance_loss_clip": 1.04030585, "balance_loss_mlp": 1.0207206, "epoch": 0.8984548788552876, "flos": 20558895344640.0, "grad_norm": 1.6545965743637256, "language_loss": 0.80343813, "learning_rate": 1.0709120973291707e-07, "loss": 0.82483304, "num_input_tokens_seen": 161548775, "step": 7472, "time_per_iteration": 2.5817360877990723 }, { "auxiliary_loss_clip": 0.01164421, "auxiliary_loss_mlp": 0.01025965, "balance_loss_clip": 1.04827023, "balance_loss_mlp": 1.01858616, "epoch": 0.8985751217459268, "flos": 17785442511360.0, "grad_norm": 3.0913500382141312, "language_loss": 0.77688742, "learning_rate": 1.0683987202446475e-07, "loss": 0.79879129, "num_input_tokens_seen": 161566960, "step": 7473, "time_per_iteration": 2.4521634578704834 }, { "auxiliary_loss_clip": 0.0115223, "auxiliary_loss_mlp": 0.01026046, "balance_loss_clip": 1.04579496, "balance_loss_mlp": 1.01929832, "epoch": 0.8986953646365659, "flos": 21617003208960.0, "grad_norm": 1.795356077941309, "language_loss": 0.6985867, "learning_rate": 1.0658882150689862e-07, "loss": 0.72036946, "num_input_tokens_seen": 161585820, "step": 7474, "time_per_iteration": 2.4952924251556396 }, { "auxiliary_loss_clip": 0.01124242, "auxiliary_loss_mlp": 0.01029055, "balance_loss_clip": 1.04340482, "balance_loss_mlp": 1.02149093, "epoch": 0.8988156075272049, "flos": 14027355083520.0, "grad_norm": 2.2232479736372768, "language_loss": 0.78403056, "learning_rate": 1.0633805821830288e-07, "loss": 0.80556351, "num_input_tokens_seen": 161602505, "step": 7475, "time_per_iteration": 2.5213968753814697 }, { "auxiliary_loss_clip": 0.01136803, "auxiliary_loss_mlp": 0.01022219, "balance_loss_clip": 1.04563403, "balance_loss_mlp": 1.01549566, "epoch": 0.8989358504178441, "flos": 29059202004480.0, "grad_norm": 2.3011930441441995, "language_loss": 0.826065, "learning_rate": 1.0608758219671753e-07, "loss": 0.8476553, "num_input_tokens_seen": 161621545, "step": 7476, "time_per_iteration": 2.578660726547241 }, { "auxiliary_loss_clip": 0.01139468, "auxiliary_loss_mlp": 0.01029267, "balance_loss_clip": 1.04446089, "balance_loss_mlp": 1.02254677, "epoch": 0.8990560933084831, "flos": 20230420446720.0, "grad_norm": 1.5018307382456926, "language_loss": 0.70488739, "learning_rate": 1.0583739348014065e-07, "loss": 0.72657478, "num_input_tokens_seen": 161642630, "step": 7477, "time_per_iteration": 2.5316243171691895 }, { "auxiliary_loss_clip": 0.01163763, "auxiliary_loss_mlp": 0.01020837, "balance_loss_clip": 1.04833055, "balance_loss_mlp": 1.01399732, "epoch": 0.8991763361991222, "flos": 25520672459520.0, "grad_norm": 2.208276624875259, "language_loss": 0.84706175, "learning_rate": 1.0558749210652518e-07, "loss": 0.86890775, "num_input_tokens_seen": 161662560, "step": 7478, "time_per_iteration": 2.4788291454315186 }, { "auxiliary_loss_clip": 0.01126581, "auxiliary_loss_mlp": 0.01028604, "balance_loss_clip": 1.0435276, "balance_loss_mlp": 1.02159202, "epoch": 0.8992965790897613, "flos": 25119191168640.0, "grad_norm": 1.5644262317940125, "language_loss": 0.85515124, "learning_rate": 1.053378781137808e-07, "loss": 0.87670302, "num_input_tokens_seen": 161683480, "step": 7479, "time_per_iteration": 2.5802230834960938 }, { "auxiliary_loss_clip": 0.01134825, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 1.04352546, "balance_loss_mlp": 1.01906931, "epoch": 0.8994168219804004, "flos": 16070815814400.0, "grad_norm": 1.7254861305304443, "language_loss": 0.77797115, "learning_rate": 1.0508855153977392e-07, "loss": 0.79958814, "num_input_tokens_seen": 161699945, "step": 7480, "time_per_iteration": 2.487196683883667 }, { "auxiliary_loss_clip": 0.01148596, "auxiliary_loss_mlp": 0.01029211, "balance_loss_clip": 1.04328418, "balance_loss_mlp": 1.02265167, "epoch": 0.8995370648710395, "flos": 24825764966400.0, "grad_norm": 2.293590329276333, "language_loss": 0.67037427, "learning_rate": 1.0483951242232669e-07, "loss": 0.69215232, "num_input_tokens_seen": 161720420, "step": 7481, "time_per_iteration": 3.312591314315796 }, { "auxiliary_loss_clip": 0.0105507, "auxiliary_loss_mlp": 0.01000815, "balance_loss_clip": 1.00791121, "balance_loss_mlp": 0.99978334, "epoch": 0.8996573077616786, "flos": 63116238378240.0, "grad_norm": 0.9760248461513609, "language_loss": 0.57760805, "learning_rate": 1.0459076079921936e-07, "loss": 0.59816688, "num_input_tokens_seen": 161773080, "step": 7482, "time_per_iteration": 3.0893590450286865 }, { "auxiliary_loss_clip": 0.01131394, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.04457569, "balance_loss_mlp": 1.02456951, "epoch": 0.8997775506523177, "flos": 18219674027520.0, "grad_norm": 2.169149424833463, "language_loss": 0.84844077, "learning_rate": 1.0434229670818618e-07, "loss": 0.87007052, "num_input_tokens_seen": 161789755, "step": 7483, "time_per_iteration": 2.4851512908935547 }, { "auxiliary_loss_clip": 0.01129021, "auxiliary_loss_mlp": 0.01025345, "balance_loss_clip": 1.0440799, "balance_loss_mlp": 1.01846087, "epoch": 0.8998977935429567, "flos": 24166768095360.0, "grad_norm": 1.4448591315838006, "language_loss": 0.79861963, "learning_rate": 1.0409412018691944e-07, "loss": 0.82016325, "num_input_tokens_seen": 161810220, "step": 7484, "time_per_iteration": 2.5464789867401123 }, { "auxiliary_loss_clip": 0.01133274, "auxiliary_loss_mlp": 0.01030074, "balance_loss_clip": 1.04570222, "balance_loss_mlp": 1.02296865, "epoch": 0.9000180364335959, "flos": 20773030273920.0, "grad_norm": 2.0062908454965496, "language_loss": 0.75151974, "learning_rate": 1.0384623127306724e-07, "loss": 0.77315319, "num_input_tokens_seen": 161827565, "step": 7485, "time_per_iteration": 2.604987382888794 }, { "auxiliary_loss_clip": 0.01118831, "auxiliary_loss_mlp": 0.01025808, "balance_loss_clip": 1.04153621, "balance_loss_mlp": 1.01928115, "epoch": 0.900138279324235, "flos": 19205745166080.0, "grad_norm": 1.720456731000409, "language_loss": 0.79304266, "learning_rate": 1.0359863000423397e-07, "loss": 0.81448907, "num_input_tokens_seen": 161845700, "step": 7486, "time_per_iteration": 2.5270373821258545 }, { "auxiliary_loss_clip": 0.01162524, "auxiliary_loss_mlp": 0.01028202, "balance_loss_clip": 1.0469166, "balance_loss_mlp": 1.02180052, "epoch": 0.900258522214874, "flos": 28731158069760.0, "grad_norm": 1.7493665357723611, "language_loss": 0.71955711, "learning_rate": 1.0335131641798112e-07, "loss": 0.74146438, "num_input_tokens_seen": 161867660, "step": 7487, "time_per_iteration": 2.525736093521118 }, { "auxiliary_loss_clip": 0.01033556, "auxiliary_loss_mlp": 0.01000271, "balance_loss_clip": 1.00719714, "balance_loss_mlp": 0.99920416, "epoch": 0.9003787651055132, "flos": 58280685655680.0, "grad_norm": 0.8028463903110241, "language_loss": 0.55687737, "learning_rate": 1.0310429055182512e-07, "loss": 0.57721567, "num_input_tokens_seen": 161921980, "step": 7488, "time_per_iteration": 2.952817678451538 }, { "auxiliary_loss_clip": 0.01126393, "auxiliary_loss_mlp": 0.01026319, "balance_loss_clip": 1.04525256, "balance_loss_mlp": 1.01867735, "epoch": 0.9004990079961522, "flos": 25556475340800.0, "grad_norm": 1.5818077682595881, "language_loss": 0.73708564, "learning_rate": 1.0285755244324024e-07, "loss": 0.75861275, "num_input_tokens_seen": 161942725, "step": 7489, "time_per_iteration": 2.6101534366607666 }, { "auxiliary_loss_clip": 0.0113784, "auxiliary_loss_mlp": 0.00760625, "balance_loss_clip": 1.04205406, "balance_loss_mlp": 1.00032806, "epoch": 0.9006192508867913, "flos": 23335185352320.0, "grad_norm": 1.517151244312872, "language_loss": 0.68486702, "learning_rate": 1.0261110212965629e-07, "loss": 0.7038517, "num_input_tokens_seen": 161964520, "step": 7490, "time_per_iteration": 2.5561819076538086 }, { "auxiliary_loss_clip": 0.01133608, "auxiliary_loss_mlp": 0.01023006, "balance_loss_clip": 1.042647, "balance_loss_mlp": 1.01647377, "epoch": 0.9007394937774305, "flos": 18040300485120.0, "grad_norm": 1.9181939712576286, "language_loss": 0.79236823, "learning_rate": 1.023649396484596e-07, "loss": 0.81393439, "num_input_tokens_seen": 161983575, "step": 7491, "time_per_iteration": 2.5147461891174316 }, { "auxiliary_loss_clip": 0.01163252, "auxiliary_loss_mlp": 0.0102622, "balance_loss_clip": 1.04727352, "balance_loss_mlp": 1.01913345, "epoch": 0.9008597366680695, "flos": 43068456633600.0, "grad_norm": 1.7760916080221485, "language_loss": 0.67558622, "learning_rate": 1.0211906503699275e-07, "loss": 0.69748092, "num_input_tokens_seen": 162006550, "step": 7492, "time_per_iteration": 3.387376308441162 }, { "auxiliary_loss_clip": 0.01154197, "auxiliary_loss_mlp": 0.01024406, "balance_loss_clip": 1.04905343, "balance_loss_mlp": 1.01745939, "epoch": 0.9009799795587086, "flos": 14939055112320.0, "grad_norm": 2.7793425758749186, "language_loss": 0.8214485, "learning_rate": 1.0187347833255455e-07, "loss": 0.84323454, "num_input_tokens_seen": 162022455, "step": 7493, "time_per_iteration": 2.4649908542633057 }, { "auxiliary_loss_clip": 0.01161527, "auxiliary_loss_mlp": 0.01031461, "balance_loss_clip": 1.04779911, "balance_loss_mlp": 1.02460372, "epoch": 0.9011002224493477, "flos": 21579584215680.0, "grad_norm": 1.6534673814803902, "language_loss": 0.79717535, "learning_rate": 1.0162817957240056e-07, "loss": 0.81910527, "num_input_tokens_seen": 162042350, "step": 7494, "time_per_iteration": 3.2698261737823486 }, { "auxiliary_loss_clip": 0.01046369, "auxiliary_loss_mlp": 0.01001797, "balance_loss_clip": 1.00892019, "balance_loss_mlp": 1.00072384, "epoch": 0.9012204653399868, "flos": 71166367883520.0, "grad_norm": 0.9189728465620935, "language_loss": 0.63017696, "learning_rate": 1.0138316879374253e-07, "loss": 0.65065861, "num_input_tokens_seen": 162111640, "step": 7495, "time_per_iteration": 3.9736175537109375 }, { "auxiliary_loss_clip": 0.01141582, "auxiliary_loss_mlp": 0.01031062, "balance_loss_clip": 1.04859424, "balance_loss_mlp": 1.02373695, "epoch": 0.9013407082306258, "flos": 15594963413760.0, "grad_norm": 2.363472194692641, "language_loss": 0.74235451, "learning_rate": 1.0113844603374833e-07, "loss": 0.764081, "num_input_tokens_seen": 162128165, "step": 7496, "time_per_iteration": 2.5218915939331055 }, { "auxiliary_loss_clip": 0.01134547, "auxiliary_loss_mlp": 0.01029906, "balance_loss_clip": 1.04266858, "balance_loss_mlp": 1.0227567, "epoch": 0.901460951121265, "flos": 15049157276160.0, "grad_norm": 2.1393418234394574, "language_loss": 0.71383053, "learning_rate": 1.0089401132954178e-07, "loss": 0.73547506, "num_input_tokens_seen": 162146145, "step": 7497, "time_per_iteration": 2.499688148498535 }, { "auxiliary_loss_clip": 0.01133996, "auxiliary_loss_mlp": 0.01022582, "balance_loss_clip": 1.04505587, "balance_loss_mlp": 1.01576638, "epoch": 0.9015811940119041, "flos": 22236857233920.0, "grad_norm": 1.6619970583703152, "language_loss": 0.72475213, "learning_rate": 1.006498647182037e-07, "loss": 0.74631792, "num_input_tokens_seen": 162164800, "step": 7498, "time_per_iteration": 2.534601926803589 }, { "auxiliary_loss_clip": 0.01092062, "auxiliary_loss_mlp": 0.01038246, "balance_loss_clip": 1.03939044, "balance_loss_mlp": 1.03099477, "epoch": 0.9017014369025431, "flos": 24973824827520.0, "grad_norm": 1.9075477075488705, "language_loss": 0.71420908, "learning_rate": 1.004060062367713e-07, "loss": 0.73551214, "num_input_tokens_seen": 162185895, "step": 7499, "time_per_iteration": 2.6510870456695557 }, { "auxiliary_loss_clip": 0.01151838, "auxiliary_loss_mlp": 0.01030875, "balance_loss_clip": 1.04721427, "balance_loss_mlp": 1.02427077, "epoch": 0.9018216797931822, "flos": 18114168804480.0, "grad_norm": 1.6451170771158339, "language_loss": 0.69276577, "learning_rate": 1.0016243592223728e-07, "loss": 0.71459293, "num_input_tokens_seen": 162206295, "step": 7500, "time_per_iteration": 2.5159363746643066 }, { "auxiliary_loss_clip": 0.01090219, "auxiliary_loss_mlp": 0.01024392, "balance_loss_clip": 1.04088032, "balance_loss_mlp": 1.01734066, "epoch": 0.9019419226838213, "flos": 37268452759680.0, "grad_norm": 1.9391163216616054, "language_loss": 0.65897995, "learning_rate": 9.991915381155114e-08, "loss": 0.68012601, "num_input_tokens_seen": 162229275, "step": 7501, "time_per_iteration": 2.724933385848999 }, { "auxiliary_loss_clip": 0.01152926, "auxiliary_loss_mlp": 0.01028176, "balance_loss_clip": 1.04628479, "balance_loss_mlp": 1.02101159, "epoch": 0.9020621655744604, "flos": 23441121538560.0, "grad_norm": 8.101052024098221, "language_loss": 0.74599332, "learning_rate": 9.967615994161871e-08, "loss": 0.76780427, "num_input_tokens_seen": 162248935, "step": 7502, "time_per_iteration": 2.529355764389038 }, { "auxiliary_loss_clip": 0.01161818, "auxiliary_loss_mlp": 0.01026779, "balance_loss_clip": 1.04751754, "balance_loss_mlp": 1.02033257, "epoch": 0.9021824084650995, "flos": 22857465444480.0, "grad_norm": 1.9214071505250454, "language_loss": 0.78274548, "learning_rate": 9.943345434930161e-08, "loss": 0.80463147, "num_input_tokens_seen": 162269185, "step": 7503, "time_per_iteration": 2.4573049545288086 }, { "auxiliary_loss_clip": 0.01120519, "auxiliary_loss_mlp": 0.01021157, "balance_loss_clip": 1.04462183, "balance_loss_mlp": 1.01421595, "epoch": 0.9023026513557386, "flos": 22127581082880.0, "grad_norm": 5.489373961705314, "language_loss": 0.6887958, "learning_rate": 9.919103707141885e-08, "loss": 0.71021259, "num_input_tokens_seen": 162288065, "step": 7504, "time_per_iteration": 2.543691635131836 }, { "auxiliary_loss_clip": 0.01149148, "auxiliary_loss_mlp": 0.01031081, "balance_loss_clip": 1.04666746, "balance_loss_mlp": 1.0233351, "epoch": 0.9024228942463777, "flos": 24199087357440.0, "grad_norm": 3.2708931545217967, "language_loss": 0.76516986, "learning_rate": 9.89489081447441e-08, "loss": 0.78697217, "num_input_tokens_seen": 162305265, "step": 7505, "time_per_iteration": 2.494748592376709 }, { "auxiliary_loss_clip": 0.011336, "auxiliary_loss_mlp": 0.01028322, "balance_loss_clip": 1.04212785, "balance_loss_mlp": 1.02092481, "epoch": 0.9025431371370167, "flos": 25008262992000.0, "grad_norm": 1.9243922285930786, "language_loss": 0.83041209, "learning_rate": 9.870706760600844e-08, "loss": 0.85203135, "num_input_tokens_seen": 162325215, "step": 7506, "time_per_iteration": 2.54723858833313 }, { "auxiliary_loss_clip": 0.01111143, "auxiliary_loss_mlp": 0.01034209, "balance_loss_clip": 1.0463407, "balance_loss_mlp": 1.0268209, "epoch": 0.9026633800276559, "flos": 18952862440320.0, "grad_norm": 1.912244342603789, "language_loss": 0.72878385, "learning_rate": 9.846551549189918e-08, "loss": 0.75023735, "num_input_tokens_seen": 162344820, "step": 7507, "time_per_iteration": 3.391357183456421 }, { "auxiliary_loss_clip": 0.01135264, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.04461706, "balance_loss_mlp": 1.02298379, "epoch": 0.902783622918295, "flos": 32416059536640.0, "grad_norm": 1.9272336757262407, "language_loss": 0.68760484, "learning_rate": 9.822425183905902e-08, "loss": 0.70926219, "num_input_tokens_seen": 162365345, "step": 7508, "time_per_iteration": 2.626051187515259 }, { "auxiliary_loss_clip": 0.01024719, "auxiliary_loss_mlp": 0.01000882, "balance_loss_clip": 1.00777125, "balance_loss_mlp": 0.99979752, "epoch": 0.902903865808934, "flos": 63717453244800.0, "grad_norm": 0.9233634456334515, "language_loss": 0.75238323, "learning_rate": 9.798327668408823e-08, "loss": 0.77263916, "num_input_tokens_seen": 162426980, "step": 7509, "time_per_iteration": 3.2298030853271484 }, { "auxiliary_loss_clip": 0.01166386, "auxiliary_loss_mlp": 0.01025971, "balance_loss_clip": 1.04777789, "balance_loss_mlp": 1.01833606, "epoch": 0.9030241086995732, "flos": 23804034600960.0, "grad_norm": 1.8709971239884235, "language_loss": 0.68870342, "learning_rate": 9.774259006354158e-08, "loss": 0.71062696, "num_input_tokens_seen": 162447050, "step": 7510, "time_per_iteration": 2.4815921783447266 }, { "auxiliary_loss_clip": 0.01140261, "auxiliary_loss_mlp": 0.01026776, "balance_loss_clip": 1.04457378, "balance_loss_mlp": 1.01932228, "epoch": 0.9031443515902122, "flos": 26395887248640.0, "grad_norm": 1.8341046496718703, "language_loss": 0.76209921, "learning_rate": 9.750219201393184e-08, "loss": 0.78376955, "num_input_tokens_seen": 162467015, "step": 7511, "time_per_iteration": 2.564682722091675 }, { "auxiliary_loss_clip": 0.01148868, "auxiliary_loss_mlp": 0.0102787, "balance_loss_clip": 1.04609859, "balance_loss_mlp": 1.02119088, "epoch": 0.9032645944808513, "flos": 24939350749440.0, "grad_norm": 1.662540853280006, "language_loss": 0.77706259, "learning_rate": 9.726208257172697e-08, "loss": 0.79882997, "num_input_tokens_seen": 162488710, "step": 7512, "time_per_iteration": 2.5492446422576904 }, { "auxiliary_loss_clip": 0.01163065, "auxiliary_loss_mlp": 0.01024551, "balance_loss_clip": 1.04717135, "balance_loss_mlp": 1.01743138, "epoch": 0.9033848373714904, "flos": 21178821196800.0, "grad_norm": 1.8620945613384838, "language_loss": 0.74410975, "learning_rate": 9.702226177335115e-08, "loss": 0.76598585, "num_input_tokens_seen": 162507205, "step": 7513, "time_per_iteration": 2.479057550430298 }, { "auxiliary_loss_clip": 0.01135288, "auxiliary_loss_mlp": 0.0102751, "balance_loss_clip": 1.04686522, "balance_loss_mlp": 1.02027977, "epoch": 0.9035050802621295, "flos": 26286359702400.0, "grad_norm": 1.5865305521496562, "language_loss": 0.72612453, "learning_rate": 9.67827296551853e-08, "loss": 0.74775255, "num_input_tokens_seen": 162528490, "step": 7514, "time_per_iteration": 2.555086851119995 }, { "auxiliary_loss_clip": 0.0112577, "auxiliary_loss_mlp": 0.00760089, "balance_loss_clip": 1.04067826, "balance_loss_mlp": 1.00036478, "epoch": 0.9036253231527686, "flos": 24204546224640.0, "grad_norm": 1.912592880445081, "language_loss": 0.68280953, "learning_rate": 9.65434862535659e-08, "loss": 0.70166814, "num_input_tokens_seen": 162547860, "step": 7515, "time_per_iteration": 2.533799409866333 }, { "auxiliary_loss_clip": 0.01140731, "auxiliary_loss_mlp": 0.0102699, "balance_loss_clip": 1.04582834, "balance_loss_mlp": 1.01943207, "epoch": 0.9037455660434077, "flos": 18072655660800.0, "grad_norm": 2.711142443370385, "language_loss": 0.65098083, "learning_rate": 9.630453160478635e-08, "loss": 0.67265797, "num_input_tokens_seen": 162563215, "step": 7516, "time_per_iteration": 2.478158950805664 }, { "auxiliary_loss_clip": 0.01108287, "auxiliary_loss_mlp": 0.01030156, "balance_loss_clip": 1.0411948, "balance_loss_mlp": 1.02347183, "epoch": 0.9038658089340468, "flos": 24060795995520.0, "grad_norm": 1.607858542849952, "language_loss": 0.82282364, "learning_rate": 9.60658657450959e-08, "loss": 0.84420806, "num_input_tokens_seen": 162583515, "step": 7517, "time_per_iteration": 2.613612174987793 }, { "auxiliary_loss_clip": 0.0112146, "auxiliary_loss_mlp": 0.01022674, "balance_loss_clip": 1.0411222, "balance_loss_mlp": 1.01551819, "epoch": 0.9039860518246858, "flos": 21834298535040.0, "grad_norm": 1.6204822065489703, "language_loss": 0.79641712, "learning_rate": 9.582748871069979e-08, "loss": 0.81785846, "num_input_tokens_seen": 162602955, "step": 7518, "time_per_iteration": 3.3233110904693604 }, { "auxiliary_loss_clip": 0.01137355, "auxiliary_loss_mlp": 0.007602, "balance_loss_clip": 1.04342592, "balance_loss_mlp": 1.00032616, "epoch": 0.904106294715325, "flos": 26614870513920.0, "grad_norm": 1.870590205917733, "language_loss": 0.83027351, "learning_rate": 9.558940053775954e-08, "loss": 0.84924906, "num_input_tokens_seen": 162621595, "step": 7519, "time_per_iteration": 2.5785863399505615 }, { "auxiliary_loss_clip": 0.01151513, "auxiliary_loss_mlp": 0.01028688, "balance_loss_clip": 1.04853773, "balance_loss_mlp": 1.02175021, "epoch": 0.904226537605964, "flos": 17785693906560.0, "grad_norm": 2.1657524778564294, "language_loss": 0.67661536, "learning_rate": 9.535160126239294e-08, "loss": 0.69841731, "num_input_tokens_seen": 162638220, "step": 7520, "time_per_iteration": 3.266892910003662 }, { "auxiliary_loss_clip": 0.01148134, "auxiliary_loss_mlp": 0.01025196, "balance_loss_clip": 1.04645658, "balance_loss_mlp": 1.01859772, "epoch": 0.9043467804966031, "flos": 24790428961920.0, "grad_norm": 1.5060776854808053, "language_loss": 0.70682383, "learning_rate": 9.511409092067424e-08, "loss": 0.72855717, "num_input_tokens_seen": 162658575, "step": 7521, "time_per_iteration": 3.2174365520477295 }, { "auxiliary_loss_clip": 0.01138973, "auxiliary_loss_mlp": 0.01025597, "balance_loss_clip": 1.0475347, "balance_loss_mlp": 1.01856637, "epoch": 0.9044670233872423, "flos": 22632125472000.0, "grad_norm": 1.8186389945338424, "language_loss": 0.67198241, "learning_rate": 9.487686954863327e-08, "loss": 0.69362807, "num_input_tokens_seen": 162678295, "step": 7522, "time_per_iteration": 2.5293962955474854 }, { "auxiliary_loss_clip": 0.0114983, "auxiliary_loss_mlp": 0.01031605, "balance_loss_clip": 1.04782748, "balance_loss_mlp": 1.02510488, "epoch": 0.9045872662778813, "flos": 23771320289280.0, "grad_norm": 1.8205543415507535, "language_loss": 0.7695173, "learning_rate": 9.46399371822566e-08, "loss": 0.79133153, "num_input_tokens_seen": 162698070, "step": 7523, "time_per_iteration": 2.5196595191955566 }, { "auxiliary_loss_clip": 0.01163308, "auxiliary_loss_mlp": 0.01027828, "balance_loss_clip": 1.04664922, "balance_loss_mlp": 1.02060938, "epoch": 0.9047075091685204, "flos": 15191039998080.0, "grad_norm": 2.229582675228149, "language_loss": 0.72821891, "learning_rate": 9.440329385748657e-08, "loss": 0.7501303, "num_input_tokens_seen": 162715140, "step": 7524, "time_per_iteration": 2.434082269668579 }, { "auxiliary_loss_clip": 0.01123108, "auxiliary_loss_mlp": 0.01018678, "balance_loss_clip": 1.04491329, "balance_loss_mlp": 1.01294672, "epoch": 0.9048277520591596, "flos": 18003707504640.0, "grad_norm": 1.840842167576267, "language_loss": 0.70582676, "learning_rate": 9.416693961022137e-08, "loss": 0.72724462, "num_input_tokens_seen": 162733390, "step": 7525, "time_per_iteration": 2.5202794075012207 }, { "auxiliary_loss_clip": 0.01083129, "auxiliary_loss_mlp": 0.01024806, "balance_loss_clip": 1.03909945, "balance_loss_mlp": 1.0178982, "epoch": 0.9049479949497986, "flos": 21872471713920.0, "grad_norm": 1.7449261672084901, "language_loss": 0.77196246, "learning_rate": 9.393087447631654e-08, "loss": 0.79304183, "num_input_tokens_seen": 162751670, "step": 7526, "time_per_iteration": 2.636296510696411 }, { "auxiliary_loss_clip": 0.01132883, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.04326558, "balance_loss_mlp": 1.01952469, "epoch": 0.9050682378404377, "flos": 20773928113920.0, "grad_norm": 2.1402821765207625, "language_loss": 0.72586107, "learning_rate": 9.36950984915823e-08, "loss": 0.7474488, "num_input_tokens_seen": 162770025, "step": 7527, "time_per_iteration": 2.523245334625244 }, { "auxiliary_loss_clip": 0.01165154, "auxiliary_loss_mlp": 0.0102427, "balance_loss_clip": 1.04834485, "balance_loss_mlp": 1.01737952, "epoch": 0.9051884807310768, "flos": 21580015178880.0, "grad_norm": 1.7336743434797648, "language_loss": 0.69104588, "learning_rate": 9.345961169178607e-08, "loss": 0.7129401, "num_input_tokens_seen": 162789710, "step": 7528, "time_per_iteration": 2.478794574737549 }, { "auxiliary_loss_clip": 0.01107854, "auxiliary_loss_mlp": 0.01026674, "balance_loss_clip": 1.04412365, "balance_loss_mlp": 1.01927733, "epoch": 0.9053087236217159, "flos": 21908059113600.0, "grad_norm": 1.423533797434014, "language_loss": 0.72768432, "learning_rate": 9.322441411265081e-08, "loss": 0.74902958, "num_input_tokens_seen": 162810695, "step": 7529, "time_per_iteration": 2.5558722019195557 }, { "auxiliary_loss_clip": 0.01130476, "auxiliary_loss_mlp": 0.0102897, "balance_loss_clip": 1.04458952, "balance_loss_mlp": 1.02193379, "epoch": 0.9054289665123549, "flos": 17055809544960.0, "grad_norm": 3.1735858840069917, "language_loss": 0.73347962, "learning_rate": 9.298950578985554e-08, "loss": 0.75507408, "num_input_tokens_seen": 162827770, "step": 7530, "time_per_iteration": 2.506119966506958 }, { "auxiliary_loss_clip": 0.01146269, "auxiliary_loss_mlp": 0.0076101, "balance_loss_clip": 1.04767561, "balance_loss_mlp": 1.00027633, "epoch": 0.905549209402994, "flos": 20777268078720.0, "grad_norm": 1.6220055736747767, "language_loss": 0.70979416, "learning_rate": 9.275488675903665e-08, "loss": 0.72886693, "num_input_tokens_seen": 162846715, "step": 7531, "time_per_iteration": 2.483637571334839 }, { "auxiliary_loss_clip": 0.01101013, "auxiliary_loss_mlp": 0.01026166, "balance_loss_clip": 1.04042864, "balance_loss_mlp": 1.01949286, "epoch": 0.9056694522936332, "flos": 21686813291520.0, "grad_norm": 2.3981714014860085, "language_loss": 0.73598546, "learning_rate": 9.252055705578454e-08, "loss": 0.75725722, "num_input_tokens_seen": 162866215, "step": 7532, "time_per_iteration": 2.6007566452026367 }, { "auxiliary_loss_clip": 0.01149821, "auxiliary_loss_mlp": 0.01023329, "balance_loss_clip": 1.0455687, "balance_loss_mlp": 1.01702881, "epoch": 0.9057896951842722, "flos": 29569133433600.0, "grad_norm": 1.606993608403782, "language_loss": 0.72470391, "learning_rate": 9.228651671564747e-08, "loss": 0.7464354, "num_input_tokens_seen": 162888245, "step": 7533, "time_per_iteration": 3.3082590103149414 }, { "auxiliary_loss_clip": 0.01105317, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.04433846, "balance_loss_mlp": 1.02868271, "epoch": 0.9059099380749113, "flos": 27892248952320.0, "grad_norm": 1.5158012277626711, "language_loss": 0.77756178, "learning_rate": 9.205276577412901e-08, "loss": 0.79896772, "num_input_tokens_seen": 162911025, "step": 7534, "time_per_iteration": 2.639434814453125 }, { "auxiliary_loss_clip": 0.0114068, "auxiliary_loss_mlp": 0.0076091, "balance_loss_clip": 1.04350185, "balance_loss_mlp": 1.00029182, "epoch": 0.9060301809655504, "flos": 17748993185280.0, "grad_norm": 2.407995880019666, "language_loss": 0.76448059, "learning_rate": 9.181930426668905e-08, "loss": 0.78349644, "num_input_tokens_seen": 162927820, "step": 7535, "time_per_iteration": 2.494900703430176 }, { "auxiliary_loss_clip": 0.01096602, "auxiliary_loss_mlp": 0.01029124, "balance_loss_clip": 1.03869987, "balance_loss_mlp": 1.02246869, "epoch": 0.9061504238561895, "flos": 31759432963200.0, "grad_norm": 1.5263092272394707, "language_loss": 0.67867208, "learning_rate": 9.158613222874346e-08, "loss": 0.69992936, "num_input_tokens_seen": 162949445, "step": 7536, "time_per_iteration": 2.65517258644104 }, { "auxiliary_loss_clip": 0.01134373, "auxiliary_loss_mlp": 0.01025108, "balance_loss_clip": 1.04233241, "balance_loss_mlp": 1.0181849, "epoch": 0.9062706667468285, "flos": 20048066075520.0, "grad_norm": 1.446165840771521, "language_loss": 0.81793916, "learning_rate": 9.135324969566394e-08, "loss": 0.83953393, "num_input_tokens_seen": 162968945, "step": 7537, "time_per_iteration": 2.5262868404388428 }, { "auxiliary_loss_clip": 0.01156796, "auxiliary_loss_mlp": 0.01029442, "balance_loss_clip": 1.04909849, "balance_loss_mlp": 1.02260256, "epoch": 0.9063909096374677, "flos": 18437292576000.0, "grad_norm": 1.823513552141641, "language_loss": 0.75465423, "learning_rate": 9.112065670277913e-08, "loss": 0.77651656, "num_input_tokens_seen": 162985310, "step": 7538, "time_per_iteration": 2.4454779624938965 }, { "auxiliary_loss_clip": 0.01130129, "auxiliary_loss_mlp": 0.01029316, "balance_loss_clip": 1.04104197, "balance_loss_mlp": 1.02326334, "epoch": 0.9065111525281068, "flos": 33547353361920.0, "grad_norm": 2.327920122028646, "language_loss": 0.72749496, "learning_rate": 9.088835328537303e-08, "loss": 0.74908936, "num_input_tokens_seen": 163006900, "step": 7539, "time_per_iteration": 2.621633768081665 }, { "auxiliary_loss_clip": 0.01138969, "auxiliary_loss_mlp": 0.01028803, "balance_loss_clip": 1.04578805, "balance_loss_mlp": 1.02209103, "epoch": 0.9066313954187458, "flos": 23367863750400.0, "grad_norm": 4.072086667351616, "language_loss": 0.71408296, "learning_rate": 9.065633947868568e-08, "loss": 0.73576069, "num_input_tokens_seen": 163026505, "step": 7540, "time_per_iteration": 2.535513162612915 }, { "auxiliary_loss_clip": 0.01121636, "auxiliary_loss_mlp": 0.0076052, "balance_loss_clip": 1.04599166, "balance_loss_mlp": 1.0003221, "epoch": 0.906751638309385, "flos": 26249623067520.0, "grad_norm": 2.282306810981983, "language_loss": 0.80220795, "learning_rate": 9.042461531791379e-08, "loss": 0.82102948, "num_input_tokens_seen": 163044925, "step": 7541, "time_per_iteration": 2.593374013900757 }, { "auxiliary_loss_clip": 0.01161202, "auxiliary_loss_mlp": 0.01021919, "balance_loss_clip": 1.04690135, "balance_loss_mlp": 1.01529741, "epoch": 0.906871881200024, "flos": 16544477485440.0, "grad_norm": 1.975492595357323, "language_loss": 0.77750731, "learning_rate": 9.019318083820903e-08, "loss": 0.79933858, "num_input_tokens_seen": 163063505, "step": 7542, "time_per_iteration": 2.436610221862793 }, { "auxiliary_loss_clip": 0.01147791, "auxiliary_loss_mlp": 0.01027463, "balance_loss_clip": 1.04545748, "balance_loss_mlp": 1.0201019, "epoch": 0.9069921240906631, "flos": 24605129675520.0, "grad_norm": 1.5727841745463118, "language_loss": 0.85113698, "learning_rate": 8.996203607468045e-08, "loss": 0.87288958, "num_input_tokens_seen": 163082505, "step": 7543, "time_per_iteration": 2.534026622772217 }, { "auxiliary_loss_clip": 0.01144078, "auxiliary_loss_mlp": 0.01023249, "balance_loss_clip": 1.0432986, "balance_loss_mlp": 1.01621902, "epoch": 0.9071123669813023, "flos": 25374731500800.0, "grad_norm": 1.4193650147364238, "language_loss": 0.75306237, "learning_rate": 8.973118106239241e-08, "loss": 0.77473569, "num_input_tokens_seen": 163105110, "step": 7544, "time_per_iteration": 3.2642009258270264 }, { "auxiliary_loss_clip": 0.01090416, "auxiliary_loss_mlp": 0.01024592, "balance_loss_clip": 1.03621578, "balance_loss_mlp": 1.01818156, "epoch": 0.9072326098719413, "flos": 26725798690560.0, "grad_norm": 2.1578681035509804, "language_loss": 0.9473592, "learning_rate": 8.95006158363656e-08, "loss": 0.96850926, "num_input_tokens_seen": 163125295, "step": 7545, "time_per_iteration": 2.6673049926757812 }, { "auxiliary_loss_clip": 0.01149132, "auxiliary_loss_mlp": 0.01027212, "balance_loss_clip": 1.04726994, "balance_loss_mlp": 1.02003872, "epoch": 0.9073528527625804, "flos": 23878800760320.0, "grad_norm": 1.7200648792727475, "language_loss": 0.77602291, "learning_rate": 8.9270340431576e-08, "loss": 0.79778636, "num_input_tokens_seen": 163144385, "step": 7546, "time_per_iteration": 3.2761638164520264 }, { "auxiliary_loss_clip": 0.01150345, "auxiliary_loss_mlp": 0.01027081, "balance_loss_clip": 1.04522121, "balance_loss_mlp": 1.02053618, "epoch": 0.9074730956532195, "flos": 37852144767360.0, "grad_norm": 2.510602769705035, "language_loss": 0.73221153, "learning_rate": 8.904035488295658e-08, "loss": 0.75398576, "num_input_tokens_seen": 163163885, "step": 7547, "time_per_iteration": 2.6206932067871094 }, { "auxiliary_loss_clip": 0.01045243, "auxiliary_loss_mlp": 0.00751252, "balance_loss_clip": 1.00834501, "balance_loss_mlp": 1.00020397, "epoch": 0.9075933385438586, "flos": 65173307385600.0, "grad_norm": 0.7852770751793497, "language_loss": 0.53266501, "learning_rate": 8.881065922539632e-08, "loss": 0.55062997, "num_input_tokens_seen": 163224325, "step": 7548, "time_per_iteration": 3.6873855590820312 }, { "auxiliary_loss_clip": 0.01117293, "auxiliary_loss_mlp": 0.01019854, "balance_loss_clip": 1.04580522, "balance_loss_mlp": 1.01326776, "epoch": 0.9077135814344977, "flos": 19931571290880.0, "grad_norm": 1.5490261587677663, "language_loss": 0.73296881, "learning_rate": 8.85812534937389e-08, "loss": 0.75434029, "num_input_tokens_seen": 163242425, "step": 7549, "time_per_iteration": 2.5423717498779297 }, { "auxiliary_loss_clip": 0.01151272, "auxiliary_loss_mlp": 0.01029209, "balance_loss_clip": 1.04697967, "balance_loss_mlp": 1.02211297, "epoch": 0.9078338243251368, "flos": 17529650784000.0, "grad_norm": 3.0055969639560853, "language_loss": 0.67277849, "learning_rate": 8.835213772278583e-08, "loss": 0.6945833, "num_input_tokens_seen": 163259280, "step": 7550, "time_per_iteration": 2.44325590133667 }, { "auxiliary_loss_clip": 0.01111639, "auxiliary_loss_mlp": 0.01024116, "balance_loss_clip": 1.04295588, "balance_loss_mlp": 1.0177834, "epoch": 0.9079540672157759, "flos": 28803410277120.0, "grad_norm": 1.6366405887197504, "language_loss": 0.7880832, "learning_rate": 8.812331194729373e-08, "loss": 0.80944073, "num_input_tokens_seen": 163278925, "step": 7551, "time_per_iteration": 2.6109108924865723 }, { "auxiliary_loss_clip": 0.01169213, "auxiliary_loss_mlp": 0.01026976, "balance_loss_clip": 1.0513829, "balance_loss_mlp": 1.01949823, "epoch": 0.9080743101064149, "flos": 23513840622720.0, "grad_norm": 2.611961233924004, "language_loss": 0.72062993, "learning_rate": 8.789477620197461e-08, "loss": 0.74259186, "num_input_tokens_seen": 163298450, "step": 7552, "time_per_iteration": 2.4790303707122803 }, { "auxiliary_loss_clip": 0.01134125, "auxiliary_loss_mlp": 0.01026913, "balance_loss_clip": 1.0432651, "balance_loss_mlp": 1.01968288, "epoch": 0.9081945529970541, "flos": 22778102344320.0, "grad_norm": 2.3104927277797547, "language_loss": 0.78947598, "learning_rate": 8.766653052149831e-08, "loss": 0.81108636, "num_input_tokens_seen": 163313635, "step": 7553, "time_per_iteration": 2.4895284175872803 }, { "auxiliary_loss_clip": 0.01134531, "auxiliary_loss_mlp": 0.01028529, "balance_loss_clip": 1.04576635, "balance_loss_mlp": 1.02182019, "epoch": 0.9083147958876931, "flos": 18873714821760.0, "grad_norm": 1.9361150589004221, "language_loss": 0.74410826, "learning_rate": 8.743857494048823e-08, "loss": 0.76573884, "num_input_tokens_seen": 163330450, "step": 7554, "time_per_iteration": 2.501981735229492 }, { "auxiliary_loss_clip": 0.01120887, "auxiliary_loss_mlp": 0.01025541, "balance_loss_clip": 1.04382849, "balance_loss_mlp": 1.01859975, "epoch": 0.9084350387783322, "flos": 18909374048640.0, "grad_norm": 2.0410445202792586, "language_loss": 0.62748158, "learning_rate": 8.721090949352605e-08, "loss": 0.64894581, "num_input_tokens_seen": 163346690, "step": 7555, "time_per_iteration": 2.526177167892456 }, { "auxiliary_loss_clip": 0.01154262, "auxiliary_loss_mlp": 0.01026022, "balance_loss_clip": 1.04744649, "balance_loss_mlp": 1.01896501, "epoch": 0.9085552816689714, "flos": 20595488325120.0, "grad_norm": 2.0122490505668376, "language_loss": 0.72797829, "learning_rate": 8.698353421514793e-08, "loss": 0.74978113, "num_input_tokens_seen": 163365065, "step": 7556, "time_per_iteration": 2.4722578525543213 }, { "auxiliary_loss_clip": 0.01148186, "auxiliary_loss_mlp": 0.01025716, "balance_loss_clip": 1.04579067, "balance_loss_mlp": 1.01889396, "epoch": 0.9086755245596104, "flos": 18113163223680.0, "grad_norm": 2.171532126397396, "language_loss": 0.80733806, "learning_rate": 8.67564491398467e-08, "loss": 0.82907706, "num_input_tokens_seen": 163382070, "step": 7557, "time_per_iteration": 2.4457597732543945 }, { "auxiliary_loss_clip": 0.01151811, "auxiliary_loss_mlp": 0.01028934, "balance_loss_clip": 1.04480004, "balance_loss_mlp": 1.02179325, "epoch": 0.9087957674502495, "flos": 19129793857920.0, "grad_norm": 2.3899239024787176, "language_loss": 0.73316777, "learning_rate": 8.652965430207104e-08, "loss": 0.7549752, "num_input_tokens_seen": 163399975, "step": 7558, "time_per_iteration": 2.475050926208496 }, { "auxiliary_loss_clip": 0.01151165, "auxiliary_loss_mlp": 0.01025351, "balance_loss_clip": 1.04496205, "balance_loss_mlp": 1.01823425, "epoch": 0.9089160103408886, "flos": 18109930999680.0, "grad_norm": 1.8815850001306704, "language_loss": 0.65349221, "learning_rate": 8.630314973622521e-08, "loss": 0.67525738, "num_input_tokens_seen": 163417520, "step": 7559, "time_per_iteration": 3.26001238822937 }, { "auxiliary_loss_clip": 0.01147953, "auxiliary_loss_mlp": 0.01028371, "balance_loss_clip": 1.04617262, "balance_loss_mlp": 1.02212417, "epoch": 0.9090362532315277, "flos": 33364855336320.0, "grad_norm": 2.026271983969225, "language_loss": 0.70879865, "learning_rate": 8.607693547666995e-08, "loss": 0.73056185, "num_input_tokens_seen": 163440060, "step": 7560, "time_per_iteration": 2.586787462234497 }, { "auxiliary_loss_clip": 0.01025958, "auxiliary_loss_mlp": 0.01002715, "balance_loss_clip": 1.00737965, "balance_loss_mlp": 1.00158262, "epoch": 0.9091564961221668, "flos": 71480585082240.0, "grad_norm": 0.8726116464099366, "language_loss": 0.57981509, "learning_rate": 8.585101155772201e-08, "loss": 0.60010183, "num_input_tokens_seen": 163502180, "step": 7561, "time_per_iteration": 3.21933650970459 }, { "auxiliary_loss_clip": 0.01125591, "auxiliary_loss_mlp": 0.01025783, "balance_loss_clip": 1.0395143, "balance_loss_mlp": 1.01854062, "epoch": 0.9092767390128058, "flos": 24712574232960.0, "grad_norm": 1.778374627565462, "language_loss": 0.68543702, "learning_rate": 8.562537801365377e-08, "loss": 0.70695078, "num_input_tokens_seen": 163521915, "step": 7562, "time_per_iteration": 2.546579360961914 }, { "auxiliary_loss_clip": 0.01162379, "auxiliary_loss_mlp": 0.01027932, "balance_loss_clip": 1.0461601, "balance_loss_mlp": 1.01995742, "epoch": 0.909396981903445, "flos": 23586487879680.0, "grad_norm": 1.9257046278900358, "language_loss": 0.69977242, "learning_rate": 8.540003487869362e-08, "loss": 0.72167552, "num_input_tokens_seen": 163543585, "step": 7563, "time_per_iteration": 2.484055280685425 }, { "auxiliary_loss_clip": 0.01112529, "auxiliary_loss_mlp": 0.010251, "balance_loss_clip": 1.04301, "balance_loss_mlp": 1.01788497, "epoch": 0.909517224794084, "flos": 23404169422080.0, "grad_norm": 1.844099305913209, "language_loss": 0.79580057, "learning_rate": 8.517498218702557e-08, "loss": 0.81717682, "num_input_tokens_seen": 163561515, "step": 7564, "time_per_iteration": 2.5626189708709717 }, { "auxiliary_loss_clip": 0.01117337, "auxiliary_loss_mlp": 0.01029199, "balance_loss_clip": 1.04170656, "balance_loss_mlp": 1.02232933, "epoch": 0.9096374676847231, "flos": 19208618254080.0, "grad_norm": 1.6948992639755918, "language_loss": 0.69345891, "learning_rate": 8.49502199727905e-08, "loss": 0.71492434, "num_input_tokens_seen": 163579540, "step": 7565, "time_per_iteration": 2.542144775390625 }, { "auxiliary_loss_clip": 0.01145289, "auxiliary_loss_mlp": 0.01027902, "balance_loss_clip": 1.04350722, "balance_loss_mlp": 1.02067232, "epoch": 0.9097577105753623, "flos": 33292495388160.0, "grad_norm": 5.589564210642016, "language_loss": 0.66102481, "learning_rate": 8.472574827008428e-08, "loss": 0.68275678, "num_input_tokens_seen": 163600425, "step": 7566, "time_per_iteration": 2.6063830852508545 }, { "auxiliary_loss_clip": 0.01148016, "auxiliary_loss_mlp": 0.0102421, "balance_loss_clip": 1.04351974, "balance_loss_mlp": 1.01717114, "epoch": 0.9098779534660013, "flos": 21906443001600.0, "grad_norm": 1.6929746349108834, "language_loss": 0.83766961, "learning_rate": 8.450156711295942e-08, "loss": 0.85939181, "num_input_tokens_seen": 163620595, "step": 7567, "time_per_iteration": 2.497229814529419 }, { "auxiliary_loss_clip": 0.01133517, "auxiliary_loss_mlp": 0.01034438, "balance_loss_clip": 1.04612648, "balance_loss_mlp": 1.02771449, "epoch": 0.9099981963566404, "flos": 25730354102400.0, "grad_norm": 2.1338451038583965, "language_loss": 0.86242801, "learning_rate": 8.427767653542383e-08, "loss": 0.88410759, "num_input_tokens_seen": 163635765, "step": 7568, "time_per_iteration": 2.5368642807006836 }, { "auxiliary_loss_clip": 0.01103682, "auxiliary_loss_mlp": 0.01026653, "balance_loss_clip": 1.03922594, "balance_loss_mlp": 1.02015591, "epoch": 0.9101184392472795, "flos": 21069437304960.0, "grad_norm": 1.8211968564229994, "language_loss": 0.70042145, "learning_rate": 8.405407657144125e-08, "loss": 0.72172481, "num_input_tokens_seen": 163654925, "step": 7569, "time_per_iteration": 2.6055727005004883 }, { "auxiliary_loss_clip": 0.01128917, "auxiliary_loss_mlp": 0.01024834, "balance_loss_clip": 1.04210973, "balance_loss_mlp": 1.01840305, "epoch": 0.9102386821379186, "flos": 24752614919040.0, "grad_norm": 1.7920105393130261, "language_loss": 0.72402555, "learning_rate": 8.383076725493232e-08, "loss": 0.74556309, "num_input_tokens_seen": 163672245, "step": 7570, "time_per_iteration": 3.356362819671631 }, { "auxiliary_loss_clip": 0.01150461, "auxiliary_loss_mlp": 0.01030769, "balance_loss_clip": 1.04566216, "balance_loss_mlp": 1.02394724, "epoch": 0.9103589250285576, "flos": 22562818179840.0, "grad_norm": 2.438517523463585, "language_loss": 0.67788219, "learning_rate": 8.360774861977216e-08, "loss": 0.69969445, "num_input_tokens_seen": 163691365, "step": 7571, "time_per_iteration": 2.515854835510254 }, { "auxiliary_loss_clip": 0.01133068, "auxiliary_loss_mlp": 0.01022958, "balance_loss_clip": 1.04054654, "balance_loss_mlp": 1.01591611, "epoch": 0.9104791679191968, "flos": 25373474524800.0, "grad_norm": 2.6053419148419805, "language_loss": 0.74603403, "learning_rate": 8.338502069979281e-08, "loss": 0.76759422, "num_input_tokens_seen": 163711675, "step": 7572, "time_per_iteration": 3.3281149864196777 }, { "auxiliary_loss_clip": 0.01148677, "auxiliary_loss_mlp": 0.01027907, "balance_loss_clip": 1.04203296, "balance_loss_mlp": 1.02125812, "epoch": 0.9105994108098359, "flos": 14426681558400.0, "grad_norm": 2.7164739864052287, "language_loss": 0.79584217, "learning_rate": 8.316258352878214e-08, "loss": 0.817608, "num_input_tokens_seen": 163728095, "step": 7573, "time_per_iteration": 3.200885534286499 }, { "auxiliary_loss_clip": 0.01151119, "auxiliary_loss_mlp": 0.01027292, "balance_loss_clip": 1.04419303, "balance_loss_mlp": 1.02024376, "epoch": 0.9107196537004749, "flos": 26718292748160.0, "grad_norm": 2.131237362347271, "language_loss": 0.71113896, "learning_rate": 8.294043714048338e-08, "loss": 0.73292309, "num_input_tokens_seen": 163747175, "step": 7574, "time_per_iteration": 2.5337460041046143 }, { "auxiliary_loss_clip": 0.01035554, "auxiliary_loss_mlp": 0.01002711, "balance_loss_clip": 1.00780129, "balance_loss_mlp": 1.00164366, "epoch": 0.9108398965911141, "flos": 66532634703360.0, "grad_norm": 0.7484370302359629, "language_loss": 0.60466337, "learning_rate": 8.271858156859624e-08, "loss": 0.62504601, "num_input_tokens_seen": 163812545, "step": 7575, "time_per_iteration": 3.1795601844787598 }, { "auxiliary_loss_clip": 0.01161013, "auxiliary_loss_mlp": 0.01026157, "balance_loss_clip": 1.04716873, "balance_loss_mlp": 1.01930821, "epoch": 0.9109601394817531, "flos": 25411073086080.0, "grad_norm": 1.6803964587103293, "language_loss": 0.73678803, "learning_rate": 8.249701684677557e-08, "loss": 0.75865972, "num_input_tokens_seen": 163833870, "step": 7576, "time_per_iteration": 2.514474868774414 }, { "auxiliary_loss_clip": 0.01151259, "auxiliary_loss_mlp": 0.01021193, "balance_loss_clip": 1.04952598, "balance_loss_mlp": 1.01452661, "epoch": 0.9110803823723922, "flos": 22747794243840.0, "grad_norm": 1.876702403247439, "language_loss": 0.80815327, "learning_rate": 8.227574300863294e-08, "loss": 0.82987779, "num_input_tokens_seen": 163854040, "step": 7577, "time_per_iteration": 2.5656697750091553 }, { "auxiliary_loss_clip": 0.01142499, "auxiliary_loss_mlp": 0.01025657, "balance_loss_clip": 1.04783595, "balance_loss_mlp": 1.01811075, "epoch": 0.9112006252630314, "flos": 48469924131840.0, "grad_norm": 1.9086796943853797, "language_loss": 0.69513214, "learning_rate": 8.205476008773548e-08, "loss": 0.71681362, "num_input_tokens_seen": 163878040, "step": 7578, "time_per_iteration": 2.7710695266723633 }, { "auxiliary_loss_clip": 0.01117324, "auxiliary_loss_mlp": 0.01027804, "balance_loss_clip": 1.04531395, "balance_loss_mlp": 1.02112579, "epoch": 0.9113208681536704, "flos": 30009649829760.0, "grad_norm": 1.922384569197043, "language_loss": 0.82464808, "learning_rate": 8.183406811760596e-08, "loss": 0.84609938, "num_input_tokens_seen": 163897770, "step": 7579, "time_per_iteration": 2.6167683601379395 }, { "auxiliary_loss_clip": 0.01108468, "auxiliary_loss_mlp": 0.01026596, "balance_loss_clip": 1.04010248, "balance_loss_mlp": 1.0198245, "epoch": 0.9114411110443095, "flos": 25594971742080.0, "grad_norm": 1.5082556565134873, "language_loss": 0.74320632, "learning_rate": 8.161366713172313e-08, "loss": 0.76455694, "num_input_tokens_seen": 163920160, "step": 7580, "time_per_iteration": 2.612069606781006 }, { "auxiliary_loss_clip": 0.01124495, "auxiliary_loss_mlp": 0.01026737, "balance_loss_clip": 1.04134321, "balance_loss_mlp": 1.01938796, "epoch": 0.9115613539349486, "flos": 18399729928320.0, "grad_norm": 4.163482342106094, "language_loss": 0.83741069, "learning_rate": 8.139355716352137e-08, "loss": 0.85892296, "num_input_tokens_seen": 163935000, "step": 7581, "time_per_iteration": 2.508976459503174 }, { "auxiliary_loss_clip": 0.01132288, "auxiliary_loss_mlp": 0.01020814, "balance_loss_clip": 1.04162025, "balance_loss_mlp": 1.0139451, "epoch": 0.9116815968255877, "flos": 21726171619200.0, "grad_norm": 1.5723226242354638, "language_loss": 0.69902802, "learning_rate": 8.117373824639196e-08, "loss": 0.720559, "num_input_tokens_seen": 163955265, "step": 7582, "time_per_iteration": 2.5303585529327393 }, { "auxiliary_loss_clip": 0.01054885, "auxiliary_loss_mlp": 0.01002451, "balance_loss_clip": 1.00805807, "balance_loss_mlp": 1.00134873, "epoch": 0.9118018397162267, "flos": 65363526835200.0, "grad_norm": 0.7196073539075255, "language_loss": 0.59311998, "learning_rate": 8.095421041368067e-08, "loss": 0.61369324, "num_input_tokens_seen": 164014680, "step": 7583, "time_per_iteration": 2.9779539108276367 }, { "auxiliary_loss_clip": 0.01134467, "auxiliary_loss_mlp": 0.00760285, "balance_loss_clip": 1.04592323, "balance_loss_mlp": 1.00033593, "epoch": 0.9119220826068659, "flos": 20922885815040.0, "grad_norm": 2.4930885094509123, "language_loss": 0.70695943, "learning_rate": 8.073497369868999e-08, "loss": 0.72590697, "num_input_tokens_seen": 164033140, "step": 7584, "time_per_iteration": 2.5402300357818604 }, { "auxiliary_loss_clip": 0.01140915, "auxiliary_loss_mlp": 0.01023157, "balance_loss_clip": 1.04343152, "balance_loss_mlp": 1.01683617, "epoch": 0.912042325497505, "flos": 28366449327360.0, "grad_norm": 1.612863218258099, "language_loss": 0.75454444, "learning_rate": 8.051602813467772e-08, "loss": 0.77618515, "num_input_tokens_seen": 164054995, "step": 7585, "time_per_iteration": 3.4162771701812744 }, { "auxiliary_loss_clip": 0.01149014, "auxiliary_loss_mlp": 0.01023531, "balance_loss_clip": 1.04425538, "balance_loss_mlp": 1.01683128, "epoch": 0.912162568388144, "flos": 17566782468480.0, "grad_norm": 1.668319483739733, "language_loss": 0.71101958, "learning_rate": 8.029737375485756e-08, "loss": 0.73274505, "num_input_tokens_seen": 164074225, "step": 7586, "time_per_iteration": 2.477029800415039 }, { "auxiliary_loss_clip": 0.01162993, "auxiliary_loss_mlp": 0.01027529, "balance_loss_clip": 1.04717922, "balance_loss_mlp": 1.02051342, "epoch": 0.9122828112787832, "flos": 19827897661440.0, "grad_norm": 1.653902586310946, "language_loss": 0.72636092, "learning_rate": 8.007901059239986e-08, "loss": 0.74826616, "num_input_tokens_seen": 164093505, "step": 7587, "time_per_iteration": 2.448899507522583 }, { "auxiliary_loss_clip": 0.01133115, "auxiliary_loss_mlp": 0.01024162, "balance_loss_clip": 1.04195976, "balance_loss_mlp": 1.01746583, "epoch": 0.9124030541694222, "flos": 20813789232000.0, "grad_norm": 1.5544616259969288, "language_loss": 0.80106419, "learning_rate": 7.986093868042964e-08, "loss": 0.82263696, "num_input_tokens_seen": 164113750, "step": 7588, "time_per_iteration": 2.532444477081299 }, { "auxiliary_loss_clip": 0.01147864, "auxiliary_loss_mlp": 0.01021184, "balance_loss_clip": 1.04648232, "balance_loss_mlp": 1.01471996, "epoch": 0.9125232970600613, "flos": 25192305302400.0, "grad_norm": 1.674883673837388, "language_loss": 0.67329061, "learning_rate": 7.964315805202826e-08, "loss": 0.6949811, "num_input_tokens_seen": 164134330, "step": 7589, "time_per_iteration": 2.520984172821045 }, { "auxiliary_loss_clip": 0.01134078, "auxiliary_loss_mlp": 0.01025898, "balance_loss_clip": 1.0443126, "balance_loss_mlp": 1.01915395, "epoch": 0.9126435399507005, "flos": 19719591177600.0, "grad_norm": 1.7308612547128135, "language_loss": 0.73333454, "learning_rate": 7.942566874023304e-08, "loss": 0.75493431, "num_input_tokens_seen": 164153515, "step": 7590, "time_per_iteration": 2.4996166229248047 }, { "auxiliary_loss_clip": 0.01127169, "auxiliary_loss_mlp": 0.01033028, "balance_loss_clip": 1.04226947, "balance_loss_mlp": 1.02483225, "epoch": 0.9127637828413395, "flos": 19573614305280.0, "grad_norm": 2.0085203203352964, "language_loss": 0.6975863, "learning_rate": 7.920847077803649e-08, "loss": 0.71918827, "num_input_tokens_seen": 164171305, "step": 7591, "time_per_iteration": 2.4885244369506836 }, { "auxiliary_loss_clip": 0.01092627, "auxiliary_loss_mlp": 0.01019204, "balance_loss_clip": 1.03519893, "balance_loss_mlp": 1.01259983, "epoch": 0.9128840257319786, "flos": 20230635928320.0, "grad_norm": 2.362125182599027, "language_loss": 0.81808072, "learning_rate": 7.899156419838826e-08, "loss": 0.83919901, "num_input_tokens_seen": 164190275, "step": 7592, "time_per_iteration": 2.5745208263397217 }, { "auxiliary_loss_clip": 0.01118931, "auxiliary_loss_mlp": 0.01030498, "balance_loss_clip": 1.04184592, "balance_loss_mlp": 1.02328312, "epoch": 0.9130042686226177, "flos": 24858658846080.0, "grad_norm": 1.7234350189664545, "language_loss": 0.656106, "learning_rate": 7.87749490341918e-08, "loss": 0.67760026, "num_input_tokens_seen": 164210550, "step": 7593, "time_per_iteration": 2.58345103263855 }, { "auxiliary_loss_clip": 0.01168036, "auxiliary_loss_mlp": 0.01028403, "balance_loss_clip": 1.04951072, "balance_loss_mlp": 1.02085698, "epoch": 0.9131245115132568, "flos": 23581747284480.0, "grad_norm": 2.0449800844392514, "language_loss": 0.83369398, "learning_rate": 7.855862531830836e-08, "loss": 0.85565835, "num_input_tokens_seen": 164226660, "step": 7594, "time_per_iteration": 2.4445040225982666 }, { "auxiliary_loss_clip": 0.01148846, "auxiliary_loss_mlp": 0.01027627, "balance_loss_clip": 1.04569328, "balance_loss_mlp": 1.02048695, "epoch": 0.9132447544038959, "flos": 19931607204480.0, "grad_norm": 1.5804590516655308, "language_loss": 0.72351074, "learning_rate": 7.834259308355373e-08, "loss": 0.74527544, "num_input_tokens_seen": 164245425, "step": 7595, "time_per_iteration": 2.4833874702453613 }, { "auxiliary_loss_clip": 0.0108027, "auxiliary_loss_mlp": 0.01023183, "balance_loss_clip": 1.03818226, "balance_loss_mlp": 1.01564908, "epoch": 0.9133649972945349, "flos": 21981747864960.0, "grad_norm": 2.2696034878088307, "language_loss": 0.75150251, "learning_rate": 7.812685236269989e-08, "loss": 0.77253711, "num_input_tokens_seen": 164264085, "step": 7596, "time_per_iteration": 3.397035837173462 }, { "auxiliary_loss_clip": 0.01020404, "auxiliary_loss_mlp": 0.0100237, "balance_loss_clip": 1.00854731, "balance_loss_mlp": 1.00130928, "epoch": 0.9134852401851741, "flos": 71240523511680.0, "grad_norm": 0.7844215204050851, "language_loss": 0.58630592, "learning_rate": 7.791140318847445e-08, "loss": 0.60653365, "num_input_tokens_seen": 164322220, "step": 7597, "time_per_iteration": 3.146777391433716 }, { "auxiliary_loss_clip": 0.01128526, "auxiliary_loss_mlp": 0.01029518, "balance_loss_clip": 1.04531741, "balance_loss_mlp": 1.02323258, "epoch": 0.9136054830758131, "flos": 23626923615360.0, "grad_norm": 1.3714477827516112, "language_loss": 0.80482507, "learning_rate": 7.769624559356081e-08, "loss": 0.82640553, "num_input_tokens_seen": 164345615, "step": 7598, "time_per_iteration": 3.358421564102173 }, { "auxiliary_loss_clip": 0.01147613, "auxiliary_loss_mlp": 0.01029275, "balance_loss_clip": 1.04457498, "balance_loss_mlp": 1.02186918, "epoch": 0.9137257259664522, "flos": 23438858981760.0, "grad_norm": 3.4236457144369212, "language_loss": 0.7511667, "learning_rate": 7.748137961059842e-08, "loss": 0.77293557, "num_input_tokens_seen": 164359595, "step": 7599, "time_per_iteration": 3.2068989276885986 }, { "auxiliary_loss_clip": 0.01160585, "auxiliary_loss_mlp": 0.01029418, "balance_loss_clip": 1.04772818, "balance_loss_mlp": 1.02249467, "epoch": 0.9138459688570914, "flos": 19127854523520.0, "grad_norm": 2.4270296233738606, "language_loss": 0.65085131, "learning_rate": 7.726680527218211e-08, "loss": 0.67275131, "num_input_tokens_seen": 164376635, "step": 7600, "time_per_iteration": 2.444509983062744 }, { "auxiliary_loss_clip": 0.01162236, "auxiliary_loss_mlp": 0.01023511, "balance_loss_clip": 1.04538989, "balance_loss_mlp": 1.01621008, "epoch": 0.9139662117477304, "flos": 46281240714240.0, "grad_norm": 1.6821811758077685, "language_loss": 0.75212038, "learning_rate": 7.70525226108627e-08, "loss": 0.77397788, "num_input_tokens_seen": 164400305, "step": 7601, "time_per_iteration": 2.659435272216797 }, { "auxiliary_loss_clip": 0.01152963, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.0503062, "balance_loss_mlp": 1.02727878, "epoch": 0.9140864546383695, "flos": 22273198819200.0, "grad_norm": 2.3001563851001188, "language_loss": 0.80010855, "learning_rate": 7.683853165914666e-08, "loss": 0.82197392, "num_input_tokens_seen": 164418075, "step": 7602, "time_per_iteration": 2.4902050495147705 }, { "auxiliary_loss_clip": 0.01109763, "auxiliary_loss_mlp": 0.01025391, "balance_loss_clip": 1.04364681, "balance_loss_mlp": 1.01871502, "epoch": 0.9142066975290086, "flos": 17530009920000.0, "grad_norm": 1.6347512659649743, "language_loss": 0.76883543, "learning_rate": 7.662483244949602e-08, "loss": 0.790187, "num_input_tokens_seen": 164435335, "step": 7603, "time_per_iteration": 2.5740301609039307 }, { "auxiliary_loss_clip": 0.01115067, "auxiliary_loss_mlp": 0.01026546, "balance_loss_clip": 1.04444349, "balance_loss_mlp": 1.02031398, "epoch": 0.9143269404196477, "flos": 17712148809600.0, "grad_norm": 2.694413738064927, "language_loss": 0.80692017, "learning_rate": 7.641142501432951e-08, "loss": 0.82833636, "num_input_tokens_seen": 164451530, "step": 7604, "time_per_iteration": 2.530841588973999 }, { "auxiliary_loss_clip": 0.01130488, "auxiliary_loss_mlp": 0.0102947, "balance_loss_clip": 1.04268813, "balance_loss_mlp": 1.02282429, "epoch": 0.9144471833102867, "flos": 33323414019840.0, "grad_norm": 1.6552615819023204, "language_loss": 0.73690373, "learning_rate": 7.619830938602013e-08, "loss": 0.75850326, "num_input_tokens_seen": 164472755, "step": 7605, "time_per_iteration": 2.6221320629119873 }, { "auxiliary_loss_clip": 0.01145039, "auxiliary_loss_mlp": 0.01024009, "balance_loss_clip": 1.04577768, "balance_loss_mlp": 1.01713085, "epoch": 0.9145674262009259, "flos": 21068970428160.0, "grad_norm": 1.8434438042807417, "language_loss": 0.82353497, "learning_rate": 7.598548559689777e-08, "loss": 0.84522545, "num_input_tokens_seen": 164491155, "step": 7606, "time_per_iteration": 2.4995813369750977 }, { "auxiliary_loss_clip": 0.0111373, "auxiliary_loss_mlp": 0.01023209, "balance_loss_clip": 1.04017198, "balance_loss_mlp": 1.01641154, "epoch": 0.914687669091565, "flos": 16800269212800.0, "grad_norm": 2.291711731950592, "language_loss": 0.80965096, "learning_rate": 7.577295367924751e-08, "loss": 0.83102036, "num_input_tokens_seen": 164507555, "step": 7607, "time_per_iteration": 2.5469439029693604 }, { "auxiliary_loss_clip": 0.01142697, "auxiliary_loss_mlp": 0.010285, "balance_loss_clip": 1.04737306, "balance_loss_mlp": 1.02126372, "epoch": 0.914807911982204, "flos": 25773627012480.0, "grad_norm": 2.0029119937614417, "language_loss": 0.82331216, "learning_rate": 7.556071366531002e-08, "loss": 0.84502417, "num_input_tokens_seen": 164528525, "step": 7608, "time_per_iteration": 2.6050970554351807 }, { "auxiliary_loss_clip": 0.01151521, "auxiliary_loss_mlp": 0.0103294, "balance_loss_clip": 1.04830337, "balance_loss_mlp": 1.02551305, "epoch": 0.9149281548728432, "flos": 19208043636480.0, "grad_norm": 2.528880563200342, "language_loss": 0.79028058, "learning_rate": 7.53487655872822e-08, "loss": 0.81212521, "num_input_tokens_seen": 164547695, "step": 7609, "time_per_iteration": 2.5132739543914795 }, { "auxiliary_loss_clip": 0.01109561, "auxiliary_loss_mlp": 0.01023197, "balance_loss_clip": 1.03917956, "balance_loss_mlp": 1.01619077, "epoch": 0.9150483977634822, "flos": 26870554500480.0, "grad_norm": 1.781831466687112, "language_loss": 0.73945796, "learning_rate": 7.513710947731656e-08, "loss": 0.76078558, "num_input_tokens_seen": 164568905, "step": 7610, "time_per_iteration": 2.6211729049682617 }, { "auxiliary_loss_clip": 0.01127431, "auxiliary_loss_mlp": 0.01026982, "balance_loss_clip": 1.0442096, "balance_loss_mlp": 1.02016664, "epoch": 0.9151686406541213, "flos": 21908956953600.0, "grad_norm": 1.8031782647416035, "language_loss": 0.85209954, "learning_rate": 7.492574536752095e-08, "loss": 0.87364364, "num_input_tokens_seen": 164588895, "step": 7611, "time_per_iteration": 3.2480111122131348 }, { "auxiliary_loss_clip": 0.01146702, "auxiliary_loss_mlp": 0.01027465, "balance_loss_clip": 1.04769027, "balance_loss_mlp": 1.02095938, "epoch": 0.9152888835447605, "flos": 27308556944640.0, "grad_norm": 6.364382044526774, "language_loss": 0.77900672, "learning_rate": 7.471467328995907e-08, "loss": 0.80074847, "num_input_tokens_seen": 164607705, "step": 7612, "time_per_iteration": 2.53262996673584 }, { "auxiliary_loss_clip": 0.01070618, "auxiliary_loss_mlp": 0.01028649, "balance_loss_clip": 1.03813684, "balance_loss_mlp": 1.02139759, "epoch": 0.9154091264353995, "flos": 13370728510080.0, "grad_norm": 2.347366691655838, "language_loss": 0.6079179, "learning_rate": 7.450389327665018e-08, "loss": 0.62891054, "num_input_tokens_seen": 164625540, "step": 7613, "time_per_iteration": 2.8319075107574463 }, { "auxiliary_loss_clip": 0.01120756, "auxiliary_loss_mlp": 0.01027243, "balance_loss_clip": 1.04529476, "balance_loss_mlp": 1.01942563, "epoch": 0.9155293693260386, "flos": 20193037367040.0, "grad_norm": 2.7700185991227695, "language_loss": 0.68109465, "learning_rate": 7.429340535957029e-08, "loss": 0.70257461, "num_input_tokens_seen": 164640735, "step": 7614, "time_per_iteration": 2.690930128097534 }, { "auxiliary_loss_clip": 0.01135555, "auxiliary_loss_mlp": 0.0101869, "balance_loss_clip": 1.04243076, "balance_loss_mlp": 1.01211572, "epoch": 0.9156496122166777, "flos": 19354990176000.0, "grad_norm": 2.7225583354817284, "language_loss": 0.7059145, "learning_rate": 7.40832095706494e-08, "loss": 0.72745693, "num_input_tokens_seen": 164657430, "step": 7615, "time_per_iteration": 2.521418809890747 }, { "auxiliary_loss_clip": 0.01126548, "auxiliary_loss_mlp": 0.0102514, "balance_loss_clip": 1.04341173, "balance_loss_mlp": 1.01855385, "epoch": 0.9157698551073168, "flos": 21107287261440.0, "grad_norm": 2.6464187009315374, "language_loss": 0.80182397, "learning_rate": 7.387330594177443e-08, "loss": 0.82334089, "num_input_tokens_seen": 164679505, "step": 7616, "time_per_iteration": 2.581125259399414 }, { "auxiliary_loss_clip": 0.01115453, "auxiliary_loss_mlp": 0.01028869, "balance_loss_clip": 1.04231119, "balance_loss_mlp": 1.02160907, "epoch": 0.9158900979979558, "flos": 25193167228800.0, "grad_norm": 1.4893695217982585, "language_loss": 0.79170477, "learning_rate": 7.366369450478749e-08, "loss": 0.81314802, "num_input_tokens_seen": 164700615, "step": 7617, "time_per_iteration": 2.5845160484313965 }, { "auxiliary_loss_clip": 0.01116327, "auxiliary_loss_mlp": 0.01025437, "balance_loss_clip": 1.04126167, "balance_loss_mlp": 1.01874983, "epoch": 0.916010340888595, "flos": 30146648302080.0, "grad_norm": 1.6218259885877768, "language_loss": 0.65835381, "learning_rate": 7.345437529148646e-08, "loss": 0.67977142, "num_input_tokens_seen": 164719625, "step": 7618, "time_per_iteration": 2.596672534942627 }, { "auxiliary_loss_clip": 0.011206, "auxiliary_loss_mlp": 0.01025212, "balance_loss_clip": 1.04291439, "balance_loss_mlp": 1.01829493, "epoch": 0.9161305837792341, "flos": 17091827907840.0, "grad_norm": 1.911550740348573, "language_loss": 0.7288968, "learning_rate": 7.324534833362483e-08, "loss": 0.75035489, "num_input_tokens_seen": 164737200, "step": 7619, "time_per_iteration": 2.5913751125335693 }, { "auxiliary_loss_clip": 0.01134808, "auxiliary_loss_mlp": 0.01023366, "balance_loss_clip": 1.04469776, "balance_loss_mlp": 1.01701534, "epoch": 0.9162508266698731, "flos": 22893699288960.0, "grad_norm": 1.651393194593592, "language_loss": 0.68594521, "learning_rate": 7.303661366291192e-08, "loss": 0.70752692, "num_input_tokens_seen": 164757870, "step": 7620, "time_per_iteration": 2.6063106060028076 }, { "auxiliary_loss_clip": 0.01104819, "auxiliary_loss_mlp": 0.01030129, "balance_loss_clip": 1.04145694, "balance_loss_mlp": 1.02268767, "epoch": 0.9163710695605123, "flos": 19974808287360.0, "grad_norm": 1.9198077000930927, "language_loss": 0.81618559, "learning_rate": 7.28281713110126e-08, "loss": 0.83753502, "num_input_tokens_seen": 164775945, "step": 7621, "time_per_iteration": 2.597595453262329 }, { "auxiliary_loss_clip": 0.01130016, "auxiliary_loss_mlp": 0.01025551, "balance_loss_clip": 1.04380679, "balance_loss_mlp": 1.0192275, "epoch": 0.9164913124511513, "flos": 22783812606720.0, "grad_norm": 2.793982684816331, "language_loss": 0.76973128, "learning_rate": 7.262002130954759e-08, "loss": 0.79128695, "num_input_tokens_seen": 164794400, "step": 7622, "time_per_iteration": 3.491931676864624 }, { "auxiliary_loss_clip": 0.01109372, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 1.0401125, "balance_loss_mlp": 1.02505565, "epoch": 0.9166115553417904, "flos": 24900854348160.0, "grad_norm": 1.6972904929415682, "language_loss": 0.78923273, "learning_rate": 7.241216369009296e-08, "loss": 0.81064731, "num_input_tokens_seen": 164814585, "step": 7623, "time_per_iteration": 2.653470754623413 }, { "auxiliary_loss_clip": 0.01163369, "auxiliary_loss_mlp": 0.01020043, "balance_loss_clip": 1.04645801, "balance_loss_mlp": 1.01263428, "epoch": 0.9167317982324296, "flos": 25702919089920.0, "grad_norm": 1.7446276334253201, "language_loss": 0.66181791, "learning_rate": 7.220459848418037e-08, "loss": 0.68365204, "num_input_tokens_seen": 164834660, "step": 7624, "time_per_iteration": 3.3519842624664307 }, { "auxiliary_loss_clip": 0.01163257, "auxiliary_loss_mlp": 0.01033081, "balance_loss_clip": 1.04965782, "balance_loss_mlp": 1.02661633, "epoch": 0.9168520411230686, "flos": 15632813370240.0, "grad_norm": 1.7172007192672034, "language_loss": 0.79445904, "learning_rate": 7.199732572329708e-08, "loss": 0.81642246, "num_input_tokens_seen": 164852560, "step": 7625, "time_per_iteration": 3.1799309253692627 }, { "auxiliary_loss_clip": 0.01120547, "auxiliary_loss_mlp": 0.01023164, "balance_loss_clip": 1.04170394, "balance_loss_mlp": 1.01595187, "epoch": 0.9169722840137077, "flos": 30258151096320.0, "grad_norm": 2.3849200558109747, "language_loss": 0.75836414, "learning_rate": 7.179034543888684e-08, "loss": 0.77980125, "num_input_tokens_seen": 164872065, "step": 7626, "time_per_iteration": 2.611210823059082 }, { "auxiliary_loss_clip": 0.01151292, "auxiliary_loss_mlp": 0.01023841, "balance_loss_clip": 1.04535723, "balance_loss_mlp": 1.01707017, "epoch": 0.9170925269043467, "flos": 22491643380480.0, "grad_norm": 1.8098665338264077, "language_loss": 0.77830321, "learning_rate": 7.158365766234808e-08, "loss": 0.80005455, "num_input_tokens_seen": 164890915, "step": 7627, "time_per_iteration": 2.489579916000366 }, { "auxiliary_loss_clip": 0.01112197, "auxiliary_loss_mlp": 0.01028722, "balance_loss_clip": 1.03877556, "balance_loss_mlp": 1.02153707, "epoch": 0.9172127697949859, "flos": 22893914770560.0, "grad_norm": 2.0221178805315145, "language_loss": 0.7216112, "learning_rate": 7.137726242503527e-08, "loss": 0.74302042, "num_input_tokens_seen": 164909835, "step": 7628, "time_per_iteration": 2.557910680770874 }, { "auxiliary_loss_clip": 0.01150318, "auxiliary_loss_mlp": 0.00761213, "balance_loss_clip": 1.04813313, "balance_loss_mlp": 1.00035548, "epoch": 0.917333012685625, "flos": 17451867882240.0, "grad_norm": 2.3480818567928874, "language_loss": 0.78097928, "learning_rate": 7.11711597582585e-08, "loss": 0.8000946, "num_input_tokens_seen": 164927195, "step": 7629, "time_per_iteration": 2.446298122406006 }, { "auxiliary_loss_clip": 0.01119533, "auxiliary_loss_mlp": 0.01024136, "balance_loss_clip": 1.03844666, "balance_loss_mlp": 1.01770782, "epoch": 0.917453255576264, "flos": 14318949692160.0, "grad_norm": 1.654209540867679, "language_loss": 0.79634386, "learning_rate": 7.096534969328271e-08, "loss": 0.81778055, "num_input_tokens_seen": 164944640, "step": 7630, "time_per_iteration": 2.530721426010132 }, { "auxiliary_loss_clip": 0.01137156, "auxiliary_loss_mlp": 0.01025454, "balance_loss_clip": 1.04204464, "balance_loss_mlp": 1.01889157, "epoch": 0.9175734984669032, "flos": 20741177888640.0, "grad_norm": 1.932714441210669, "language_loss": 0.84176052, "learning_rate": 7.075983226132987e-08, "loss": 0.86338663, "num_input_tokens_seen": 164963570, "step": 7631, "time_per_iteration": 2.520890474319458 }, { "auxiliary_loss_clip": 0.01139553, "auxiliary_loss_mlp": 0.00761617, "balance_loss_clip": 1.04341424, "balance_loss_mlp": 1.00027621, "epoch": 0.9176937413575422, "flos": 14830497233280.0, "grad_norm": 2.3541079266607574, "language_loss": 0.7901963, "learning_rate": 7.055460749357656e-08, "loss": 0.80920804, "num_input_tokens_seen": 164979850, "step": 7632, "time_per_iteration": 2.5131471157073975 }, { "auxiliary_loss_clip": 0.01136688, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.0470295, "balance_loss_mlp": 1.02491856, "epoch": 0.9178139842481813, "flos": 18474603828480.0, "grad_norm": 1.699099630802657, "language_loss": 0.70207202, "learning_rate": 7.034967542115521e-08, "loss": 0.72376353, "num_input_tokens_seen": 164998115, "step": 7633, "time_per_iteration": 2.4944441318511963 }, { "auxiliary_loss_clip": 0.01138893, "auxiliary_loss_mlp": 0.00760725, "balance_loss_clip": 1.04306471, "balance_loss_mlp": 1.00033927, "epoch": 0.9179342271388204, "flos": 20047455544320.0, "grad_norm": 1.9738854432377848, "language_loss": 0.75105989, "learning_rate": 7.014503607515388e-08, "loss": 0.77005601, "num_input_tokens_seen": 165017420, "step": 7634, "time_per_iteration": 2.497682571411133 }, { "auxiliary_loss_clip": 0.0113733, "auxiliary_loss_mlp": 0.01029411, "balance_loss_clip": 1.04787517, "balance_loss_mlp": 1.0220325, "epoch": 0.9180544700294595, "flos": 24676232647680.0, "grad_norm": 1.9087921325872663, "language_loss": 0.68544447, "learning_rate": 6.994068948661592e-08, "loss": 0.70711184, "num_input_tokens_seen": 165035575, "step": 7635, "time_per_iteration": 2.540978193283081 }, { "auxiliary_loss_clip": 0.01149167, "auxiliary_loss_mlp": 0.01031273, "balance_loss_clip": 1.04533041, "balance_loss_mlp": 1.02329838, "epoch": 0.9181747129200986, "flos": 16727478301440.0, "grad_norm": 2.4350864347472987, "language_loss": 0.7638514, "learning_rate": 6.973663568654142e-08, "loss": 0.78565586, "num_input_tokens_seen": 165053280, "step": 7636, "time_per_iteration": 2.4792795181274414 }, { "auxiliary_loss_clip": 0.01163302, "auxiliary_loss_mlp": 0.01026916, "balance_loss_clip": 1.04792368, "balance_loss_mlp": 1.0201509, "epoch": 0.9182949558107377, "flos": 24271626873600.0, "grad_norm": 1.8881347905107015, "language_loss": 0.64834678, "learning_rate": 6.953287470588386e-08, "loss": 0.67024893, "num_input_tokens_seen": 165071235, "step": 7637, "time_per_iteration": 3.2862117290496826 }, { "auxiliary_loss_clip": 0.0115111, "auxiliary_loss_mlp": 0.01024622, "balance_loss_clip": 1.04413092, "balance_loss_mlp": 1.01743364, "epoch": 0.9184151987013768, "flos": 22082117443200.0, "grad_norm": 2.052889639735495, "language_loss": 0.86048305, "learning_rate": 6.932940657555452e-08, "loss": 0.88224036, "num_input_tokens_seen": 165087365, "step": 7638, "time_per_iteration": 2.4721901416778564 }, { "auxiliary_loss_clip": 0.01159272, "auxiliary_loss_mlp": 0.0102763, "balance_loss_clip": 1.04758525, "balance_loss_mlp": 1.02170861, "epoch": 0.9185354415920158, "flos": 32166732257280.0, "grad_norm": 1.4310083669181788, "language_loss": 0.76293689, "learning_rate": 6.912623132641938e-08, "loss": 0.78480589, "num_input_tokens_seen": 165112455, "step": 7639, "time_per_iteration": 2.6046388149261475 }, { "auxiliary_loss_clip": 0.01137234, "auxiliary_loss_mlp": 0.01029021, "balance_loss_clip": 1.04549706, "balance_loss_mlp": 1.0220294, "epoch": 0.918655684482655, "flos": 20997831542400.0, "grad_norm": 2.063562705998082, "language_loss": 0.76478601, "learning_rate": 6.892334898929952e-08, "loss": 0.7864486, "num_input_tokens_seen": 165132700, "step": 7640, "time_per_iteration": 2.5810108184814453 }, { "auxiliary_loss_clip": 0.01143908, "auxiliary_loss_mlp": 0.01029151, "balance_loss_clip": 1.04497612, "balance_loss_mlp": 1.02258897, "epoch": 0.918775927373294, "flos": 15560704817280.0, "grad_norm": 1.7825278264456614, "language_loss": 0.84419233, "learning_rate": 6.872075959497236e-08, "loss": 0.86592293, "num_input_tokens_seen": 165151475, "step": 7641, "time_per_iteration": 2.568774700164795 }, { "auxiliary_loss_clip": 0.01146538, "auxiliary_loss_mlp": 0.01022037, "balance_loss_clip": 1.04335272, "balance_loss_mlp": 1.01515305, "epoch": 0.9188961702639331, "flos": 29934057657600.0, "grad_norm": 1.8068815669015137, "language_loss": 0.827196, "learning_rate": 6.85184631741702e-08, "loss": 0.84888184, "num_input_tokens_seen": 165172040, "step": 7642, "time_per_iteration": 2.5405378341674805 }, { "auxiliary_loss_clip": 0.01148639, "auxiliary_loss_mlp": 0.01028518, "balance_loss_clip": 1.04562688, "balance_loss_mlp": 1.02168393, "epoch": 0.9190164131545723, "flos": 20701244943360.0, "grad_norm": 1.7555574032176964, "language_loss": 0.77377486, "learning_rate": 6.831645975758161e-08, "loss": 0.79554641, "num_input_tokens_seen": 165189980, "step": 7643, "time_per_iteration": 2.5203957557678223 }, { "auxiliary_loss_clip": 0.0112788, "auxiliary_loss_mlp": 0.01029337, "balance_loss_clip": 1.04475319, "balance_loss_mlp": 1.02151072, "epoch": 0.9191366560452113, "flos": 25629912696960.0, "grad_norm": 1.8401938935219608, "language_loss": 0.67278928, "learning_rate": 6.811474937585026e-08, "loss": 0.69436145, "num_input_tokens_seen": 165209770, "step": 7644, "time_per_iteration": 2.6572232246398926 }, { "auxiliary_loss_clip": 0.01116884, "auxiliary_loss_mlp": 0.01023769, "balance_loss_clip": 1.04226041, "balance_loss_mlp": 1.01792765, "epoch": 0.9192568989358504, "flos": 21434325615360.0, "grad_norm": 1.530598345694022, "language_loss": 0.79114401, "learning_rate": 6.79133320595755e-08, "loss": 0.81255054, "num_input_tokens_seen": 165229690, "step": 7645, "time_per_iteration": 2.5722286701202393 }, { "auxiliary_loss_clip": 0.01138763, "auxiliary_loss_mlp": 0.01026953, "balance_loss_clip": 1.04649782, "balance_loss_mlp": 1.02017033, "epoch": 0.9193771418264896, "flos": 23185078416000.0, "grad_norm": 1.641734230474859, "language_loss": 0.75438553, "learning_rate": 6.771220783931198e-08, "loss": 0.7760427, "num_input_tokens_seen": 165249850, "step": 7646, "time_per_iteration": 2.5558085441589355 }, { "auxiliary_loss_clip": 0.00980346, "auxiliary_loss_mlp": 0.00751022, "balance_loss_clip": 1.00983763, "balance_loss_mlp": 1.00013161, "epoch": 0.9194973847171286, "flos": 70582963184640.0, "grad_norm": 1.0445576493932889, "language_loss": 0.64613622, "learning_rate": 6.751137674556994e-08, "loss": 0.66344988, "num_input_tokens_seen": 165310235, "step": 7647, "time_per_iteration": 3.5503108501434326 }, { "auxiliary_loss_clip": 0.01151318, "auxiliary_loss_mlp": 0.01024447, "balance_loss_clip": 1.04374385, "balance_loss_mlp": 1.01704395, "epoch": 0.9196176276077677, "flos": 14720682378240.0, "grad_norm": 1.9933636741664869, "language_loss": 0.77341312, "learning_rate": 6.731083880881572e-08, "loss": 0.79517072, "num_input_tokens_seen": 165326455, "step": 7648, "time_per_iteration": 3.7403910160064697 }, { "auxiliary_loss_clip": 0.01136027, "auxiliary_loss_mlp": 0.01022362, "balance_loss_clip": 1.0446409, "balance_loss_mlp": 1.01582348, "epoch": 0.9197378704984068, "flos": 23294893271040.0, "grad_norm": 2.1386908998783363, "language_loss": 0.8096754, "learning_rate": 6.711059405947072e-08, "loss": 0.83125925, "num_input_tokens_seen": 165344645, "step": 7649, "time_per_iteration": 2.536228895187378 }, { "auxiliary_loss_clip": 0.01118412, "auxiliary_loss_mlp": 0.01029819, "balance_loss_clip": 1.04436958, "balance_loss_mlp": 1.02311361, "epoch": 0.9198581133890459, "flos": 20302564913280.0, "grad_norm": 1.7655479541017438, "language_loss": 0.76958871, "learning_rate": 6.691064252791156e-08, "loss": 0.791071, "num_input_tokens_seen": 165364120, "step": 7650, "time_per_iteration": 3.616182565689087 }, { "auxiliary_loss_clip": 0.01099499, "auxiliary_loss_mlp": 0.01028985, "balance_loss_clip": 1.04271019, "balance_loss_mlp": 1.02212405, "epoch": 0.9199783562796849, "flos": 17675663569920.0, "grad_norm": 1.5454846052045323, "language_loss": 0.77973354, "learning_rate": 6.67109842444713e-08, "loss": 0.80101836, "num_input_tokens_seen": 165383050, "step": 7651, "time_per_iteration": 3.2830073833465576 }, { "auxiliary_loss_clip": 0.01144519, "auxiliary_loss_mlp": 0.0076055, "balance_loss_clip": 1.0459187, "balance_loss_mlp": 1.0002923, "epoch": 0.9200985991703241, "flos": 17676022705920.0, "grad_norm": 1.8069501323315724, "language_loss": 0.76650584, "learning_rate": 6.651161923943704e-08, "loss": 0.78555655, "num_input_tokens_seen": 165400955, "step": 7652, "time_per_iteration": 2.4761130809783936 }, { "auxiliary_loss_clip": 0.01142858, "auxiliary_loss_mlp": 0.01027117, "balance_loss_clip": 1.04346418, "balance_loss_mlp": 1.01994646, "epoch": 0.9202188420609632, "flos": 20996574566400.0, "grad_norm": 1.7565639733266192, "language_loss": 0.77056587, "learning_rate": 6.631254754305326e-08, "loss": 0.79226565, "num_input_tokens_seen": 165420415, "step": 7653, "time_per_iteration": 2.488304376602173 }, { "auxiliary_loss_clip": 0.01162746, "auxiliary_loss_mlp": 0.01025143, "balance_loss_clip": 1.04471147, "balance_loss_mlp": 1.01813638, "epoch": 0.9203390849516022, "flos": 13918222586880.0, "grad_norm": 1.8481650229219, "language_loss": 0.78227305, "learning_rate": 6.611376918551848e-08, "loss": 0.80415189, "num_input_tokens_seen": 165439200, "step": 7654, "time_per_iteration": 2.453503370285034 }, { "auxiliary_loss_clip": 0.01113769, "auxiliary_loss_mlp": 0.00761005, "balance_loss_clip": 1.04037952, "balance_loss_mlp": 1.00028729, "epoch": 0.9204593278422414, "flos": 21175912195200.0, "grad_norm": 2.9978394828784096, "language_loss": 0.79303151, "learning_rate": 6.591528419698744e-08, "loss": 0.81177926, "num_input_tokens_seen": 165458985, "step": 7655, "time_per_iteration": 2.5455081462860107 }, { "auxiliary_loss_clip": 0.01135557, "auxiliary_loss_mlp": 0.01022893, "balance_loss_clip": 1.04204512, "balance_loss_mlp": 1.01635468, "epoch": 0.9205795707328804, "flos": 14501375890560.0, "grad_norm": 2.3235160184181334, "language_loss": 0.83418882, "learning_rate": 6.571709260756986e-08, "loss": 0.85577333, "num_input_tokens_seen": 165475630, "step": 7656, "time_per_iteration": 2.5028228759765625 }, { "auxiliary_loss_clip": 0.0115188, "auxiliary_loss_mlp": 0.01028458, "balance_loss_clip": 1.04898262, "balance_loss_mlp": 1.02177036, "epoch": 0.9206998136235195, "flos": 22417559579520.0, "grad_norm": 2.559173880076541, "language_loss": 0.76446748, "learning_rate": 6.551919444733122e-08, "loss": 0.78627086, "num_input_tokens_seen": 165493445, "step": 7657, "time_per_iteration": 2.484921455383301 }, { "auxiliary_loss_clip": 0.01136669, "auxiliary_loss_mlp": 0.01021647, "balance_loss_clip": 1.0469178, "balance_loss_mlp": 1.01493263, "epoch": 0.9208200565141585, "flos": 53358407544960.0, "grad_norm": 1.8815481446419366, "language_loss": 0.6600818, "learning_rate": 6.53215897462931e-08, "loss": 0.681665, "num_input_tokens_seen": 165517200, "step": 7658, "time_per_iteration": 2.827063798904419 }, { "auxiliary_loss_clip": 0.01147051, "auxiliary_loss_mlp": 0.01034329, "balance_loss_clip": 1.04507172, "balance_loss_mlp": 1.02699709, "epoch": 0.9209402994047977, "flos": 30589139946240.0, "grad_norm": 2.654986365830183, "language_loss": 0.74559379, "learning_rate": 6.512427853443103e-08, "loss": 0.76740766, "num_input_tokens_seen": 165539280, "step": 7659, "time_per_iteration": 2.5660204887390137 }, { "auxiliary_loss_clip": 0.01149746, "auxiliary_loss_mlp": 0.01022222, "balance_loss_clip": 1.0452745, "balance_loss_mlp": 1.01537967, "epoch": 0.9210605422954368, "flos": 29132711187840.0, "grad_norm": 1.531669066675027, "language_loss": 0.75934571, "learning_rate": 6.492726084167799e-08, "loss": 0.7810654, "num_input_tokens_seen": 165561395, "step": 7660, "time_per_iteration": 2.5592429637908936 }, { "auxiliary_loss_clip": 0.01054367, "auxiliary_loss_mlp": 0.01002413, "balance_loss_clip": 1.00750971, "balance_loss_mlp": 1.00133991, "epoch": 0.9211807851860758, "flos": 54853838472960.0, "grad_norm": 0.8361272992307527, "language_loss": 0.57546914, "learning_rate": 6.473053669792072e-08, "loss": 0.59603697, "num_input_tokens_seen": 165616085, "step": 7661, "time_per_iteration": 2.932178020477295 }, { "auxiliary_loss_clip": 0.0114688, "auxiliary_loss_mlp": 0.01024243, "balance_loss_clip": 1.04416251, "balance_loss_mlp": 1.01713848, "epoch": 0.921301028076715, "flos": 19201974238080.0, "grad_norm": 2.1867030196138173, "language_loss": 0.73093599, "learning_rate": 6.453410613300248e-08, "loss": 0.75264728, "num_input_tokens_seen": 165634015, "step": 7662, "time_per_iteration": 2.475830316543579 }, { "auxiliary_loss_clip": 0.01095204, "auxiliary_loss_mlp": 0.01027805, "balance_loss_clip": 1.04213417, "balance_loss_mlp": 1.02088761, "epoch": 0.921421270967354, "flos": 27526893765120.0, "grad_norm": 1.5267614312039506, "language_loss": 0.57827556, "learning_rate": 6.43379691767214e-08, "loss": 0.59950566, "num_input_tokens_seen": 165653220, "step": 7663, "time_per_iteration": 3.497004985809326 }, { "auxiliary_loss_clip": 0.01014622, "auxiliary_loss_mlp": 0.01001189, "balance_loss_clip": 1.00709343, "balance_loss_mlp": 1.00010383, "epoch": 0.9215415138579931, "flos": 70209311955840.0, "grad_norm": 0.7203311196117483, "language_loss": 0.55173892, "learning_rate": 6.414212585883105e-08, "loss": 0.57189703, "num_input_tokens_seen": 165715850, "step": 7664, "time_per_iteration": 3.227815866470337 }, { "auxiliary_loss_clip": 0.01137951, "auxiliary_loss_mlp": 0.01025273, "balance_loss_clip": 1.04566455, "balance_loss_mlp": 1.01812339, "epoch": 0.9216617567486323, "flos": 35553107790720.0, "grad_norm": 1.5303492803399512, "language_loss": 0.69705069, "learning_rate": 6.394657620904143e-08, "loss": 0.71868289, "num_input_tokens_seen": 165738960, "step": 7665, "time_per_iteration": 2.634403944015503 }, { "auxiliary_loss_clip": 0.01164804, "auxiliary_loss_mlp": 0.01027977, "balance_loss_clip": 1.04753089, "balance_loss_mlp": 1.02046633, "epoch": 0.9217819996392713, "flos": 29533330552320.0, "grad_norm": 1.7269338344895377, "language_loss": 0.72010994, "learning_rate": 6.375132025701657e-08, "loss": 0.74203771, "num_input_tokens_seen": 165761260, "step": 7666, "time_per_iteration": 2.5288872718811035 }, { "auxiliary_loss_clip": 0.01166009, "auxiliary_loss_mlp": 0.01028483, "balance_loss_clip": 1.04954195, "balance_loss_mlp": 1.02062988, "epoch": 0.9219022425299104, "flos": 14574669592320.0, "grad_norm": 2.163519874451551, "language_loss": 0.68889356, "learning_rate": 6.355635803237724e-08, "loss": 0.71083844, "num_input_tokens_seen": 165776960, "step": 7667, "time_per_iteration": 2.4355993270874023 }, { "auxiliary_loss_clip": 0.01146817, "auxiliary_loss_mlp": 0.01027173, "balance_loss_clip": 1.04396963, "balance_loss_mlp": 1.01987791, "epoch": 0.9220224854205495, "flos": 18077503996800.0, "grad_norm": 2.4882929743183095, "language_loss": 0.7968623, "learning_rate": 6.336168956469867e-08, "loss": 0.8186022, "num_input_tokens_seen": 165795435, "step": 7668, "time_per_iteration": 2.4568698406219482 }, { "auxiliary_loss_clip": 0.01128038, "auxiliary_loss_mlp": 0.01031168, "balance_loss_clip": 1.04400909, "balance_loss_mlp": 1.02473962, "epoch": 0.9221427283111886, "flos": 24790464875520.0, "grad_norm": 1.5655812884996205, "language_loss": 0.72036123, "learning_rate": 6.316731488351168e-08, "loss": 0.74195331, "num_input_tokens_seen": 165816625, "step": 7669, "time_per_iteration": 2.537635326385498 }, { "auxiliary_loss_clip": 0.01149486, "auxiliary_loss_mlp": 0.01022331, "balance_loss_clip": 1.04684746, "balance_loss_mlp": 1.01532197, "epoch": 0.9222629712018277, "flos": 13845036625920.0, "grad_norm": 2.6677993551257435, "language_loss": 0.63433349, "learning_rate": 6.297323401830334e-08, "loss": 0.65605164, "num_input_tokens_seen": 165835410, "step": 7670, "time_per_iteration": 2.469575881958008 }, { "auxiliary_loss_clip": 0.01150467, "auxiliary_loss_mlp": 0.0103165, "balance_loss_clip": 1.04553425, "balance_loss_mlp": 1.02476549, "epoch": 0.9223832140924668, "flos": 21616177196160.0, "grad_norm": 1.872931138501021, "language_loss": 0.68843591, "learning_rate": 6.277944699851523e-08, "loss": 0.71025711, "num_input_tokens_seen": 165854930, "step": 7671, "time_per_iteration": 2.4808924198150635 }, { "auxiliary_loss_clip": 0.01161546, "auxiliary_loss_mlp": 0.01028702, "balance_loss_clip": 1.04596746, "balance_loss_mlp": 1.02195752, "epoch": 0.9225034569831059, "flos": 21142084561920.0, "grad_norm": 1.7435335082504113, "language_loss": 0.73247039, "learning_rate": 6.25859538535447e-08, "loss": 0.75437284, "num_input_tokens_seen": 165875725, "step": 7672, "time_per_iteration": 2.4509432315826416 }, { "auxiliary_loss_clip": 0.01136767, "auxiliary_loss_mlp": 0.01026948, "balance_loss_clip": 1.04678154, "balance_loss_mlp": 1.02005434, "epoch": 0.9226236998737449, "flos": 12495046844160.0, "grad_norm": 2.7573546752160736, "language_loss": 0.78039718, "learning_rate": 6.239275461274474e-08, "loss": 0.80203438, "num_input_tokens_seen": 165892100, "step": 7673, "time_per_iteration": 2.4919064044952393 }, { "auxiliary_loss_clip": 0.01150165, "auxiliary_loss_mlp": 0.01026909, "balance_loss_clip": 1.04740798, "balance_loss_mlp": 1.02021241, "epoch": 0.9227439427643841, "flos": 26214071581440.0, "grad_norm": 1.7452956926787306, "language_loss": 0.8596037, "learning_rate": 6.219984930542299e-08, "loss": 0.88137448, "num_input_tokens_seen": 165912840, "step": 7674, "time_per_iteration": 3.305701971054077 }, { "auxiliary_loss_clip": 0.01152825, "auxiliary_loss_mlp": 0.01027955, "balance_loss_clip": 1.04665768, "balance_loss_mlp": 1.02040935, "epoch": 0.9228641856550232, "flos": 17967581400960.0, "grad_norm": 2.4401691037694073, "language_loss": 0.75740826, "learning_rate": 6.200723796084383e-08, "loss": 0.77921605, "num_input_tokens_seen": 165930935, "step": 7675, "time_per_iteration": 2.4476311206817627 }, { "auxiliary_loss_clip": 0.01030115, "auxiliary_loss_mlp": 0.01001803, "balance_loss_clip": 1.01000547, "balance_loss_mlp": 1.00061703, "epoch": 0.9229844285456622, "flos": 70420609710720.0, "grad_norm": 0.7593497909037988, "language_loss": 0.63055813, "learning_rate": 6.181492060822546e-08, "loss": 0.65087736, "num_input_tokens_seen": 165991110, "step": 7676, "time_per_iteration": 3.8189635276794434 }, { "auxiliary_loss_clip": 0.01104482, "auxiliary_loss_mlp": 0.01024712, "balance_loss_clip": 1.03963017, "balance_loss_mlp": 1.01816702, "epoch": 0.9231046714363014, "flos": 17967832796160.0, "grad_norm": 2.944341498075886, "language_loss": 0.82152832, "learning_rate": 6.162289727674274e-08, "loss": 0.84282029, "num_input_tokens_seen": 166008790, "step": 7677, "time_per_iteration": 3.2444796562194824 }, { "auxiliary_loss_clip": 0.01119827, "auxiliary_loss_mlp": 0.01023433, "balance_loss_clip": 1.04198241, "balance_loss_mlp": 1.01734114, "epoch": 0.9232249143269404, "flos": 17858233422720.0, "grad_norm": 2.233484917530925, "language_loss": 0.87798786, "learning_rate": 6.143116799552527e-08, "loss": 0.8994205, "num_input_tokens_seen": 166025035, "step": 7678, "time_per_iteration": 2.523056745529175 }, { "auxiliary_loss_clip": 0.01151207, "auxiliary_loss_mlp": 0.01024648, "balance_loss_clip": 1.04581594, "balance_loss_mlp": 1.01764464, "epoch": 0.9233451572175795, "flos": 23404384903680.0, "grad_norm": 2.5184850434258146, "language_loss": 0.55793488, "learning_rate": 6.123973279365802e-08, "loss": 0.57969338, "num_input_tokens_seen": 166044010, "step": 7679, "time_per_iteration": 2.5386502742767334 }, { "auxiliary_loss_clip": 0.01152278, "auxiliary_loss_mlp": 0.01036492, "balance_loss_clip": 1.04766941, "balance_loss_mlp": 1.0297296, "epoch": 0.9234654001082186, "flos": 17999326045440.0, "grad_norm": 1.7298326821859793, "language_loss": 0.77812696, "learning_rate": 6.10485917001824e-08, "loss": 0.80001462, "num_input_tokens_seen": 166061865, "step": 7680, "time_per_iteration": 2.478100061416626 }, { "auxiliary_loss_clip": 0.01132932, "auxiliary_loss_mlp": 0.01026849, "balance_loss_clip": 1.04174495, "balance_loss_mlp": 1.02018499, "epoch": 0.9235856429988577, "flos": 24750747411840.0, "grad_norm": 1.492200027632409, "language_loss": 0.80973506, "learning_rate": 6.085774474409322e-08, "loss": 0.8313328, "num_input_tokens_seen": 166082425, "step": 7681, "time_per_iteration": 2.618298292160034 }, { "auxiliary_loss_clip": 0.01137618, "auxiliary_loss_mlp": 0.01028144, "balance_loss_clip": 1.04773974, "balance_loss_mlp": 1.02126586, "epoch": 0.9237058858894968, "flos": 14099894599680.0, "grad_norm": 1.7709358630121466, "language_loss": 0.6976428, "learning_rate": 6.066719195434267e-08, "loss": 0.71930045, "num_input_tokens_seen": 166100225, "step": 7682, "time_per_iteration": 2.497868061065674 }, { "auxiliary_loss_clip": 0.0115231, "auxiliary_loss_mlp": 0.01030494, "balance_loss_clip": 1.0473659, "balance_loss_mlp": 1.02299595, "epoch": 0.9238261287801359, "flos": 28694529175680.0, "grad_norm": 2.1840117427667165, "language_loss": 0.66323727, "learning_rate": 6.047693335983717e-08, "loss": 0.68506533, "num_input_tokens_seen": 166122570, "step": 7683, "time_per_iteration": 2.5644378662109375 }, { "auxiliary_loss_clip": 0.01152606, "auxiliary_loss_mlp": 0.01030392, "balance_loss_clip": 1.04630017, "balance_loss_mlp": 1.02244711, "epoch": 0.923946371670775, "flos": 23111856541440.0, "grad_norm": 2.33429427601771, "language_loss": 0.82335985, "learning_rate": 6.028696898943853e-08, "loss": 0.84518981, "num_input_tokens_seen": 166141630, "step": 7684, "time_per_iteration": 2.503946542739868 }, { "auxiliary_loss_clip": 0.01136309, "auxiliary_loss_mlp": 0.00761067, "balance_loss_clip": 1.04324627, "balance_loss_mlp": 1.00031281, "epoch": 0.924066614561414, "flos": 21867120587520.0, "grad_norm": 1.9877791811572185, "language_loss": 0.71079314, "learning_rate": 6.00972988719648e-08, "loss": 0.72976685, "num_input_tokens_seen": 166159865, "step": 7685, "time_per_iteration": 2.5226075649261475 }, { "auxiliary_loss_clip": 0.01123058, "auxiliary_loss_mlp": 0.0076083, "balance_loss_clip": 1.04249895, "balance_loss_mlp": 1.00030708, "epoch": 0.9241868574520532, "flos": 28511887495680.0, "grad_norm": 2.3109812188963095, "language_loss": 0.70687747, "learning_rate": 5.990792303618807e-08, "loss": 0.72571629, "num_input_tokens_seen": 166179445, "step": 7686, "time_per_iteration": 2.602893114089966 }, { "auxiliary_loss_clip": 0.01118792, "auxiliary_loss_mlp": 0.01020924, "balance_loss_clip": 1.04255605, "balance_loss_mlp": 1.01460934, "epoch": 0.9243071003426923, "flos": 30518324282880.0, "grad_norm": 1.8341891207524899, "language_loss": 0.69449145, "learning_rate": 5.971884151083695e-08, "loss": 0.71588862, "num_input_tokens_seen": 166201855, "step": 7687, "time_per_iteration": 2.647028684616089 }, { "auxiliary_loss_clip": 0.011395, "auxiliary_loss_mlp": 0.01027212, "balance_loss_clip": 1.04611504, "balance_loss_mlp": 1.02057505, "epoch": 0.9244273432333313, "flos": 28658331244800.0, "grad_norm": 1.778803184095275, "language_loss": 0.74242657, "learning_rate": 5.9530054324595124e-08, "loss": 0.76409376, "num_input_tokens_seen": 166221970, "step": 7688, "time_per_iteration": 2.587827205657959 }, { "auxiliary_loss_clip": 0.01041105, "auxiliary_loss_mlp": 0.00751281, "balance_loss_clip": 1.00688946, "balance_loss_mlp": 1.0001992, "epoch": 0.9245475861239704, "flos": 66230589237120.0, "grad_norm": 0.72228600628726, "language_loss": 0.57581925, "learning_rate": 5.934156150610103e-08, "loss": 0.59374309, "num_input_tokens_seen": 166279335, "step": 7689, "time_per_iteration": 3.8479158878326416 }, { "auxiliary_loss_clip": 0.01133574, "auxiliary_loss_mlp": 0.01024408, "balance_loss_clip": 1.0433979, "balance_loss_mlp": 1.01757705, "epoch": 0.9246678290146095, "flos": 24239918142720.0, "grad_norm": 2.240030638223351, "language_loss": 0.79165953, "learning_rate": 5.915336308394914e-08, "loss": 0.81323934, "num_input_tokens_seen": 166298170, "step": 7690, "time_per_iteration": 2.5376858711242676 }, { "auxiliary_loss_clip": 0.01145682, "auxiliary_loss_mlp": 0.01025048, "balance_loss_clip": 1.04516292, "balance_loss_mlp": 1.01835418, "epoch": 0.9247880719052486, "flos": 18988808976000.0, "grad_norm": 1.5318764621139471, "language_loss": 0.7683115, "learning_rate": 5.89654590866886e-08, "loss": 0.7900188, "num_input_tokens_seen": 166317670, "step": 7691, "time_per_iteration": 2.500533103942871 }, { "auxiliary_loss_clip": 0.01096829, "auxiliary_loss_mlp": 0.01023055, "balance_loss_clip": 1.04382539, "balance_loss_mlp": 1.01563454, "epoch": 0.9249083147958876, "flos": 24024095274240.0, "grad_norm": 2.040693574887011, "language_loss": 0.88080895, "learning_rate": 5.877784954282483e-08, "loss": 0.90200782, "num_input_tokens_seen": 166337010, "step": 7692, "time_per_iteration": 2.6274616718292236 }, { "auxiliary_loss_clip": 0.01152969, "auxiliary_loss_mlp": 0.01022513, "balance_loss_clip": 1.0463562, "balance_loss_mlp": 1.01519954, "epoch": 0.9250285576865268, "flos": 30773972355840.0, "grad_norm": 1.9847138999261853, "language_loss": 0.72499931, "learning_rate": 5.8590534480817963e-08, "loss": 0.74675417, "num_input_tokens_seen": 166358735, "step": 7693, "time_per_iteration": 2.5713694095611572 }, { "auxiliary_loss_clip": 0.01166021, "auxiliary_loss_mlp": 0.01025598, "balance_loss_clip": 1.04954076, "balance_loss_mlp": 1.01806998, "epoch": 0.9251488005771659, "flos": 10633581348480.0, "grad_norm": 2.277819387373324, "language_loss": 0.72236764, "learning_rate": 5.840351392908349e-08, "loss": 0.7442838, "num_input_tokens_seen": 166374455, "step": 7694, "time_per_iteration": 2.433427572250366 }, { "auxiliary_loss_clip": 0.01140136, "auxiliary_loss_mlp": 0.00760635, "balance_loss_clip": 1.04330206, "balance_loss_mlp": 1.00028861, "epoch": 0.9252690434678049, "flos": 23586416052480.0, "grad_norm": 2.833469425040787, "language_loss": 0.70580804, "learning_rate": 5.821678791599205e-08, "loss": 0.72481573, "num_input_tokens_seen": 166393900, "step": 7695, "time_per_iteration": 2.5653109550476074 }, { "auxiliary_loss_clip": 0.01137014, "auxiliary_loss_mlp": 0.01027772, "balance_loss_clip": 1.04621196, "balance_loss_mlp": 1.02102828, "epoch": 0.9253892863584441, "flos": 21469158829440.0, "grad_norm": 1.878808402566749, "language_loss": 0.80588621, "learning_rate": 5.803035646986965e-08, "loss": 0.82753408, "num_input_tokens_seen": 166413235, "step": 7696, "time_per_iteration": 2.5599372386932373 }, { "auxiliary_loss_clip": 0.01166877, "auxiliary_loss_mlp": 0.01025303, "balance_loss_clip": 1.04754817, "balance_loss_mlp": 1.01762569, "epoch": 0.9255095292490831, "flos": 17456680304640.0, "grad_norm": 2.3349298933423266, "language_loss": 0.67308688, "learning_rate": 5.7844219618998766e-08, "loss": 0.6950087, "num_input_tokens_seen": 166427560, "step": 7697, "time_per_iteration": 2.4302470684051514 }, { "auxiliary_loss_clip": 0.01106442, "auxiliary_loss_mlp": 0.01025955, "balance_loss_clip": 1.03803778, "balance_loss_mlp": 1.01894546, "epoch": 0.9256297721397222, "flos": 24750675584640.0, "grad_norm": 2.1474153535272236, "language_loss": 0.71497786, "learning_rate": 5.765837739161505e-08, "loss": 0.7363019, "num_input_tokens_seen": 166446680, "step": 7698, "time_per_iteration": 2.5706520080566406 }, { "auxiliary_loss_clip": 0.01121558, "auxiliary_loss_mlp": 0.01028679, "balance_loss_clip": 1.04336882, "balance_loss_mlp": 1.02218246, "epoch": 0.9257500150303614, "flos": 23112215677440.0, "grad_norm": 1.5731078966653718, "language_loss": 0.7434563, "learning_rate": 5.7472829815911504e-08, "loss": 0.76495862, "num_input_tokens_seen": 166465505, "step": 7699, "time_per_iteration": 2.558093786239624 }, { "auxiliary_loss_clip": 0.01131411, "auxiliary_loss_mlp": 0.01024095, "balance_loss_clip": 1.0439713, "balance_loss_mlp": 1.01727343, "epoch": 0.9258702579210004, "flos": 22564685687040.0, "grad_norm": 1.8267780707858625, "language_loss": 0.81573522, "learning_rate": 5.7287576920035164e-08, "loss": 0.83729029, "num_input_tokens_seen": 166484520, "step": 7700, "time_per_iteration": 3.320439100265503 }, { "auxiliary_loss_clip": 0.01118183, "auxiliary_loss_mlp": 0.01022343, "balance_loss_clip": 1.04314661, "balance_loss_mlp": 1.01566458, "epoch": 0.9259905008116395, "flos": 30004298703360.0, "grad_norm": 1.771719580741003, "language_loss": 0.768305, "learning_rate": 5.7102618732088435e-08, "loss": 0.78971028, "num_input_tokens_seen": 166503850, "step": 7701, "time_per_iteration": 2.600755214691162 }, { "auxiliary_loss_clip": 0.01137605, "auxiliary_loss_mlp": 0.01025472, "balance_loss_clip": 1.04596698, "balance_loss_mlp": 1.01834607, "epoch": 0.9261107437022786, "flos": 24572128055040.0, "grad_norm": 1.5709040201079039, "language_loss": 0.74462175, "learning_rate": 5.6917955280130216e-08, "loss": 0.76625252, "num_input_tokens_seen": 166525330, "step": 7702, "time_per_iteration": 4.111975431442261 }, { "auxiliary_loss_clip": 0.01145589, "auxiliary_loss_mlp": 0.01027265, "balance_loss_clip": 1.04545581, "balance_loss_mlp": 1.02010059, "epoch": 0.9262309865929177, "flos": 22018448586240.0, "grad_norm": 2.9937019391656765, "language_loss": 0.7193619, "learning_rate": 5.6733586592172755e-08, "loss": 0.74109054, "num_input_tokens_seen": 166544825, "step": 7703, "time_per_iteration": 2.493257999420166 }, { "auxiliary_loss_clip": 0.0112948, "auxiliary_loss_mlp": 0.00760277, "balance_loss_clip": 1.04116893, "balance_loss_mlp": 1.0003171, "epoch": 0.9263512294835567, "flos": 20339481116160.0, "grad_norm": 1.8827491365675897, "language_loss": 0.79998046, "learning_rate": 5.6549512696185244e-08, "loss": 0.81887805, "num_input_tokens_seen": 166563325, "step": 7704, "time_per_iteration": 2.5413458347320557 }, { "auxiliary_loss_clip": 0.01163565, "auxiliary_loss_mlp": 0.01026835, "balance_loss_clip": 1.04940867, "balance_loss_mlp": 1.02008772, "epoch": 0.9264714723741959, "flos": 21215378263680.0, "grad_norm": 1.6228881400248352, "language_loss": 0.67940199, "learning_rate": 5.636573362009156e-08, "loss": 0.70130599, "num_input_tokens_seen": 166583385, "step": 7705, "time_per_iteration": 2.4741077423095703 }, { "auxiliary_loss_clip": 0.01166428, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.04773843, "balance_loss_mlp": 1.02265787, "epoch": 0.926591715264835, "flos": 18004964480640.0, "grad_norm": 2.1476169126112743, "language_loss": 0.77088559, "learning_rate": 5.618224939177074e-08, "loss": 0.7928524, "num_input_tokens_seen": 166601290, "step": 7706, "time_per_iteration": 2.427630662918091 }, { "auxiliary_loss_clip": 0.01124429, "auxiliary_loss_mlp": 0.01027445, "balance_loss_clip": 1.04173565, "balance_loss_mlp": 1.02034664, "epoch": 0.926711958155474, "flos": 36167969825280.0, "grad_norm": 1.8727036330982585, "language_loss": 0.70504987, "learning_rate": 5.599906003905719e-08, "loss": 0.72656858, "num_input_tokens_seen": 166623835, "step": 7707, "time_per_iteration": 2.6413724422454834 }, { "auxiliary_loss_clip": 0.0114242, "auxiliary_loss_mlp": 0.01025685, "balance_loss_clip": 1.04546571, "balance_loss_mlp": 1.01860142, "epoch": 0.9268322010461132, "flos": 21032736583680.0, "grad_norm": 2.0856011696089514, "language_loss": 0.81887889, "learning_rate": 5.581616558974023e-08, "loss": 0.84055996, "num_input_tokens_seen": 166642400, "step": 7708, "time_per_iteration": 2.4676475524902344 }, { "auxiliary_loss_clip": 0.01155274, "auxiliary_loss_mlp": 0.0076077, "balance_loss_clip": 1.04701793, "balance_loss_mlp": 1.00028646, "epoch": 0.9269524439367522, "flos": 22964838174720.0, "grad_norm": 1.720788183393249, "language_loss": 0.79004109, "learning_rate": 5.5633566071565444e-08, "loss": 0.80920154, "num_input_tokens_seen": 166661640, "step": 7709, "time_per_iteration": 2.5171151161193848 }, { "auxiliary_loss_clip": 0.01096315, "auxiliary_loss_mlp": 0.01019848, "balance_loss_clip": 1.03965235, "balance_loss_mlp": 1.01311827, "epoch": 0.9270726868273913, "flos": 41975551468800.0, "grad_norm": 1.9725667982392892, "language_loss": 0.70748883, "learning_rate": 5.5451261512232896e-08, "loss": 0.72865051, "num_input_tokens_seen": 166684320, "step": 7710, "time_per_iteration": 2.784224033355713 }, { "auxiliary_loss_clip": 0.01153071, "auxiliary_loss_mlp": 0.01024097, "balance_loss_clip": 1.04520941, "balance_loss_mlp": 1.01670933, "epoch": 0.9271929297180305, "flos": 19791771557760.0, "grad_norm": 2.8486836527600614, "language_loss": 0.62283218, "learning_rate": 5.5269251939397576e-08, "loss": 0.64460385, "num_input_tokens_seen": 166703835, "step": 7711, "time_per_iteration": 2.4868571758270264 }, { "auxiliary_loss_clip": 0.01119859, "auxiliary_loss_mlp": 0.01021878, "balance_loss_clip": 1.0399375, "balance_loss_mlp": 1.01452851, "epoch": 0.9273131726086695, "flos": 19968343839360.0, "grad_norm": 2.041828099144128, "language_loss": 0.76562399, "learning_rate": 5.508753738067073e-08, "loss": 0.78704131, "num_input_tokens_seen": 166723375, "step": 7712, "time_per_iteration": 2.56257963180542 }, { "auxiliary_loss_clip": 0.0115276, "auxiliary_loss_mlp": 0.01027221, "balance_loss_clip": 1.0446558, "balance_loss_mlp": 1.02045321, "epoch": 0.9274334154993086, "flos": 23258587599360.0, "grad_norm": 1.8597029596727213, "language_loss": 0.78984201, "learning_rate": 5.4906117863617875e-08, "loss": 0.81164181, "num_input_tokens_seen": 166742760, "step": 7713, "time_per_iteration": 2.505225419998169 }, { "auxiliary_loss_clip": 0.01116757, "auxiliary_loss_mlp": 0.01025082, "balance_loss_clip": 1.04055202, "balance_loss_mlp": 1.01877856, "epoch": 0.9275536583899477, "flos": 31795343585280.0, "grad_norm": 1.8905904802750306, "language_loss": 0.78255057, "learning_rate": 5.4724993415760533e-08, "loss": 0.80396897, "num_input_tokens_seen": 166761115, "step": 7714, "time_per_iteration": 2.6505308151245117 }, { "auxiliary_loss_clip": 0.01128112, "auxiliary_loss_mlp": 0.00761256, "balance_loss_clip": 1.04175448, "balance_loss_mlp": 1.00036287, "epoch": 0.9276739012805868, "flos": 18696998885760.0, "grad_norm": 3.0696474371003415, "language_loss": 0.74441141, "learning_rate": 5.454416406457496e-08, "loss": 0.76330507, "num_input_tokens_seen": 166780210, "step": 7715, "time_per_iteration": 3.3030738830566406 }, { "auxiliary_loss_clip": 0.01147781, "auxiliary_loss_mlp": 0.01025295, "balance_loss_clip": 1.04393578, "balance_loss_mlp": 1.01914144, "epoch": 0.9277941441712259, "flos": 13879079740800.0, "grad_norm": 2.161976477522184, "language_loss": 0.73557997, "learning_rate": 5.436362983749299e-08, "loss": 0.75731075, "num_input_tokens_seen": 166795380, "step": 7716, "time_per_iteration": 2.45060658454895 }, { "auxiliary_loss_clip": 0.01121247, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.04898512, "balance_loss_mlp": 1.02683115, "epoch": 0.927914387061865, "flos": 23258659426560.0, "grad_norm": 1.8377320720767139, "language_loss": 0.64173901, "learning_rate": 5.418339076190137e-08, "loss": 0.66328979, "num_input_tokens_seen": 166814890, "step": 7717, "time_per_iteration": 2.5724127292633057 }, { "auxiliary_loss_clip": 0.01130361, "auxiliary_loss_mlp": 0.01025723, "balance_loss_clip": 1.04531145, "balance_loss_mlp": 1.01882648, "epoch": 0.9280346299525041, "flos": 18073733068800.0, "grad_norm": 1.7910359483526372, "language_loss": 0.88377035, "learning_rate": 5.400344686514202e-08, "loss": 0.90533113, "num_input_tokens_seen": 166832475, "step": 7718, "time_per_iteration": 2.501126289367676 }, { "auxiliary_loss_clip": 0.01148608, "auxiliary_loss_mlp": 0.01025393, "balance_loss_clip": 1.0469898, "balance_loss_mlp": 1.01867223, "epoch": 0.9281548728431431, "flos": 22342901160960.0, "grad_norm": 1.735497392725897, "language_loss": 0.66801947, "learning_rate": 5.38237981745131e-08, "loss": 0.68975949, "num_input_tokens_seen": 166850590, "step": 7719, "time_per_iteration": 2.5678069591522217 }, { "auxiliary_loss_clip": 0.01150361, "auxiliary_loss_mlp": 0.00760557, "balance_loss_clip": 1.04482603, "balance_loss_mlp": 1.00031722, "epoch": 0.9282751157337822, "flos": 18843765857280.0, "grad_norm": 1.6454047507522218, "language_loss": 0.81210983, "learning_rate": 5.364444471726592e-08, "loss": 0.83121902, "num_input_tokens_seen": 166869795, "step": 7720, "time_per_iteration": 2.4806504249572754 }, { "auxiliary_loss_clip": 0.01147436, "auxiliary_loss_mlp": 0.01029964, "balance_loss_clip": 1.04454458, "balance_loss_mlp": 1.0226922, "epoch": 0.9283953586244214, "flos": 25556834476800.0, "grad_norm": 1.886145385643796, "language_loss": 0.7971949, "learning_rate": 5.346538652060939e-08, "loss": 0.81896889, "num_input_tokens_seen": 166891150, "step": 7721, "time_per_iteration": 2.544504404067993 }, { "auxiliary_loss_clip": 0.01134072, "auxiliary_loss_mlp": 0.01024202, "balance_loss_clip": 1.04620826, "balance_loss_mlp": 1.01746118, "epoch": 0.9285156015150604, "flos": 18223480869120.0, "grad_norm": 1.9313693634514075, "language_loss": 0.70133543, "learning_rate": 5.3286623611705994e-08, "loss": 0.72291815, "num_input_tokens_seen": 166909195, "step": 7722, "time_per_iteration": 2.5068166255950928 }, { "auxiliary_loss_clip": 0.01054436, "auxiliary_loss_mlp": 0.01002842, "balance_loss_clip": 1.00761294, "balance_loss_mlp": 1.00175762, "epoch": 0.9286358444056995, "flos": 66400017690240.0, "grad_norm": 0.8063935666207264, "language_loss": 0.60662413, "learning_rate": 5.3108156017673824e-08, "loss": 0.62719691, "num_input_tokens_seen": 166970955, "step": 7723, "time_per_iteration": 3.179189682006836 }, { "auxiliary_loss_clip": 0.01137996, "auxiliary_loss_mlp": 0.01022916, "balance_loss_clip": 1.04571247, "balance_loss_mlp": 1.01568568, "epoch": 0.9287560872963386, "flos": 22345630594560.0, "grad_norm": 1.61048877994738, "language_loss": 0.71664333, "learning_rate": 5.2929983765586775e-08, "loss": 0.7382524, "num_input_tokens_seen": 166989735, "step": 7724, "time_per_iteration": 2.522639751434326 }, { "auxiliary_loss_clip": 0.01161505, "auxiliary_loss_mlp": 0.01027627, "balance_loss_clip": 1.04671192, "balance_loss_mlp": 1.02043271, "epoch": 0.9288763301869777, "flos": 25700225569920.0, "grad_norm": 1.908975374021451, "language_loss": 0.62505043, "learning_rate": 5.275210688247278e-08, "loss": 0.64694172, "num_input_tokens_seen": 167010060, "step": 7725, "time_per_iteration": 2.501647710800171 }, { "auxiliary_loss_clip": 0.01108437, "auxiliary_loss_mlp": 0.01026802, "balance_loss_clip": 1.04328227, "balance_loss_mlp": 1.02012026, "epoch": 0.9289965730776167, "flos": 12312046028160.0, "grad_norm": 2.4261390793196203, "language_loss": 0.84782779, "learning_rate": 5.257452539531604e-08, "loss": 0.86918014, "num_input_tokens_seen": 167027130, "step": 7726, "time_per_iteration": 3.3678996562957764 }, { "auxiliary_loss_clip": 0.01148736, "auxiliary_loss_mlp": 0.01026689, "balance_loss_clip": 1.04380679, "balance_loss_mlp": 1.01991224, "epoch": 0.9291168159682559, "flos": 26685973486080.0, "grad_norm": 1.5029415882509454, "language_loss": 0.68195033, "learning_rate": 5.2397239331055445e-08, "loss": 0.7037046, "num_input_tokens_seen": 167049130, "step": 7727, "time_per_iteration": 2.528411626815796 }, { "auxiliary_loss_clip": 0.01134682, "auxiliary_loss_mlp": 0.0102382, "balance_loss_clip": 1.04584861, "balance_loss_mlp": 1.01668549, "epoch": 0.929237058858895, "flos": 14538256179840.0, "grad_norm": 2.474510087553388, "language_loss": 0.81125176, "learning_rate": 5.2220248716585036e-08, "loss": 0.83283675, "num_input_tokens_seen": 167066810, "step": 7728, "time_per_iteration": 3.2860183715820312 }, { "auxiliary_loss_clip": 0.01140257, "auxiliary_loss_mlp": 0.01030105, "balance_loss_clip": 1.04310811, "balance_loss_mlp": 1.02283311, "epoch": 0.929357301749534, "flos": 23835456023040.0, "grad_norm": 2.149270020831387, "language_loss": 0.75692213, "learning_rate": 5.204355357875445e-08, "loss": 0.77862579, "num_input_tokens_seen": 167085155, "step": 7729, "time_per_iteration": 3.240177631378174 }, { "auxiliary_loss_clip": 0.0113127, "auxiliary_loss_mlp": 0.01025825, "balance_loss_clip": 1.04245806, "balance_loss_mlp": 1.0187645, "epoch": 0.9294775446401732, "flos": 12969319046400.0, "grad_norm": 2.7873887043018235, "language_loss": 0.70580828, "learning_rate": 5.1867153944367584e-08, "loss": 0.7273792, "num_input_tokens_seen": 167101545, "step": 7730, "time_per_iteration": 2.4915401935577393 }, { "auxiliary_loss_clip": 0.01125966, "auxiliary_loss_mlp": 0.0102342, "balance_loss_clip": 1.04390728, "balance_loss_mlp": 1.01726866, "epoch": 0.9295977875308122, "flos": 26211809024640.0, "grad_norm": 1.621498344115582, "language_loss": 0.73423123, "learning_rate": 5.16910498401848e-08, "loss": 0.75572509, "num_input_tokens_seen": 167120995, "step": 7731, "time_per_iteration": 2.5965700149536133 }, { "auxiliary_loss_clip": 0.01163306, "auxiliary_loss_mlp": 0.01023307, "balance_loss_clip": 1.05016613, "balance_loss_mlp": 1.01729846, "epoch": 0.9297180304214513, "flos": 16472297105280.0, "grad_norm": 1.9677871855528999, "language_loss": 0.83362234, "learning_rate": 5.151524129292073e-08, "loss": 0.85548848, "num_input_tokens_seen": 167138890, "step": 7732, "time_per_iteration": 2.4481592178344727 }, { "auxiliary_loss_clip": 0.01148261, "auxiliary_loss_mlp": 0.01027415, "balance_loss_clip": 1.04580581, "balance_loss_mlp": 1.02006912, "epoch": 0.9298382733120905, "flos": 24060436859520.0, "grad_norm": 2.7728504800955083, "language_loss": 0.66440046, "learning_rate": 5.1339728329245155e-08, "loss": 0.68615717, "num_input_tokens_seen": 167159455, "step": 7733, "time_per_iteration": 2.5072975158691406 }, { "auxiliary_loss_clip": 0.01167288, "auxiliary_loss_mlp": 0.01027086, "balance_loss_clip": 1.04825735, "balance_loss_mlp": 1.01994812, "epoch": 0.9299585162027295, "flos": 22127652910080.0, "grad_norm": 3.873609447709687, "language_loss": 0.79018366, "learning_rate": 5.116451097578367e-08, "loss": 0.81212735, "num_input_tokens_seen": 167178495, "step": 7734, "time_per_iteration": 2.4784107208251953 }, { "auxiliary_loss_clip": 0.01119161, "auxiliary_loss_mlp": 0.01024082, "balance_loss_clip": 1.04180408, "balance_loss_mlp": 1.01782405, "epoch": 0.9300787590933686, "flos": 21471780522240.0, "grad_norm": 1.6151443284423441, "language_loss": 0.74342847, "learning_rate": 5.0989589259115895e-08, "loss": 0.76486087, "num_input_tokens_seen": 167199380, "step": 7735, "time_per_iteration": 2.5633625984191895 }, { "auxiliary_loss_clip": 0.01146548, "auxiliary_loss_mlp": 0.01032822, "balance_loss_clip": 1.04379189, "balance_loss_mlp": 1.02453685, "epoch": 0.9301990019840077, "flos": 17779588594560.0, "grad_norm": 1.700742103400936, "language_loss": 0.71393633, "learning_rate": 5.081496320577816e-08, "loss": 0.73573005, "num_input_tokens_seen": 167216500, "step": 7736, "time_per_iteration": 2.4816906452178955 }, { "auxiliary_loss_clip": 0.01029997, "auxiliary_loss_mlp": 0.01001485, "balance_loss_clip": 1.00840437, "balance_loss_mlp": 1.00043607, "epoch": 0.9303192448746468, "flos": 58896122307840.0, "grad_norm": 0.9314083684430908, "language_loss": 0.61227691, "learning_rate": 5.0640632842260835e-08, "loss": 0.63259172, "num_input_tokens_seen": 167276760, "step": 7737, "time_per_iteration": 3.1791553497314453 }, { "auxiliary_loss_clip": 0.01118608, "auxiliary_loss_mlp": 0.00760692, "balance_loss_clip": 1.04487288, "balance_loss_mlp": 1.00029159, "epoch": 0.9304394877652858, "flos": 57663522172800.0, "grad_norm": 1.4979989358425179, "language_loss": 0.72614515, "learning_rate": 5.0466598195009426e-08, "loss": 0.74493814, "num_input_tokens_seen": 167303630, "step": 7738, "time_per_iteration": 2.880998134613037 }, { "auxiliary_loss_clip": 0.01120227, "auxiliary_loss_mlp": 0.01028151, "balance_loss_clip": 1.0444324, "balance_loss_mlp": 1.02154946, "epoch": 0.930559730655925, "flos": 20996143603200.0, "grad_norm": 1.9790486910340126, "language_loss": 0.70076948, "learning_rate": 5.0292859290425036e-08, "loss": 0.72225326, "num_input_tokens_seen": 167321500, "step": 7739, "time_per_iteration": 2.5399749279022217 }, { "auxiliary_loss_clip": 0.01162835, "auxiliary_loss_mlp": 0.01024244, "balance_loss_clip": 1.04820991, "balance_loss_mlp": 1.01736569, "epoch": 0.9306799735465641, "flos": 23258264376960.0, "grad_norm": 1.8212460020888777, "language_loss": 0.77775645, "learning_rate": 5.011941615486348e-08, "loss": 0.79962718, "num_input_tokens_seen": 167340615, "step": 7740, "time_per_iteration": 2.4674181938171387 }, { "auxiliary_loss_clip": 0.01163745, "auxiliary_loss_mlp": 0.01029169, "balance_loss_clip": 1.04799652, "balance_loss_mlp": 1.02260387, "epoch": 0.9308002164372031, "flos": 15231547560960.0, "grad_norm": 1.8696310088684718, "language_loss": 0.84142101, "learning_rate": 4.994626881463659e-08, "loss": 0.86335015, "num_input_tokens_seen": 167356870, "step": 7741, "time_per_iteration": 3.1739439964294434 }, { "auxiliary_loss_clip": 0.0109442, "auxiliary_loss_mlp": 0.01025255, "balance_loss_clip": 1.04034519, "balance_loss_mlp": 1.01826072, "epoch": 0.9309204593278423, "flos": 30847481539200.0, "grad_norm": 1.7029864142210447, "language_loss": 0.7085762, "learning_rate": 4.9773417296009814e-08, "loss": 0.72977293, "num_input_tokens_seen": 167378390, "step": 7742, "time_per_iteration": 2.648183822631836 }, { "auxiliary_loss_clip": 0.01154028, "auxiliary_loss_mlp": 0.01034364, "balance_loss_clip": 1.04691935, "balance_loss_mlp": 1.02682984, "epoch": 0.9310407022184813, "flos": 23037269950080.0, "grad_norm": 2.584485081293265, "language_loss": 0.65569925, "learning_rate": 4.960086162520527e-08, "loss": 0.67758322, "num_input_tokens_seen": 167398480, "step": 7743, "time_per_iteration": 2.517779588699341 }, { "auxiliary_loss_clip": 0.01110439, "auxiliary_loss_mlp": 0.01022779, "balance_loss_clip": 1.04256392, "balance_loss_mlp": 1.01588321, "epoch": 0.9311609451091204, "flos": 22127976132480.0, "grad_norm": 1.8714804163950711, "language_loss": 0.82659018, "learning_rate": 4.942860182839936e-08, "loss": 0.84792233, "num_input_tokens_seen": 167416825, "step": 7744, "time_per_iteration": 2.5890183448791504 }, { "auxiliary_loss_clip": 0.01133768, "auxiliary_loss_mlp": 0.01024269, "balance_loss_clip": 1.04354405, "balance_loss_mlp": 1.0178051, "epoch": 0.9312811879997596, "flos": 21099206701440.0, "grad_norm": 1.950409155177218, "language_loss": 0.79266948, "learning_rate": 4.925663793172341e-08, "loss": 0.81424987, "num_input_tokens_seen": 167434785, "step": 7745, "time_per_iteration": 2.533735752105713 }, { "auxiliary_loss_clip": 0.01032605, "auxiliary_loss_mlp": 0.0075119, "balance_loss_clip": 1.00870395, "balance_loss_mlp": 1.00022125, "epoch": 0.9314014308903986, "flos": 67148179096320.0, "grad_norm": 0.7824167776902249, "language_loss": 0.56576383, "learning_rate": 4.908496996126477e-08, "loss": 0.58360171, "num_input_tokens_seen": 167498245, "step": 7746, "time_per_iteration": 3.1496355533599854 }, { "auxiliary_loss_clip": 0.01147202, "auxiliary_loss_mlp": 0.01026757, "balance_loss_clip": 1.04868126, "balance_loss_mlp": 1.01961017, "epoch": 0.9315216737810377, "flos": 22565583527040.0, "grad_norm": 1.9292519357996785, "language_loss": 0.7617619, "learning_rate": 4.89135979430646e-08, "loss": 0.78350151, "num_input_tokens_seen": 167518290, "step": 7747, "time_per_iteration": 2.5138373374938965 }, { "auxiliary_loss_clip": 0.0116477, "auxiliary_loss_mlp": 0.01027298, "balance_loss_clip": 1.04839754, "balance_loss_mlp": 1.01990676, "epoch": 0.9316419166716768, "flos": 23984054588160.0, "grad_norm": 1.6544903470416328, "language_loss": 0.85374177, "learning_rate": 4.874252190312078e-08, "loss": 0.87566245, "num_input_tokens_seen": 167538675, "step": 7748, "time_per_iteration": 2.49051833152771 }, { "auxiliary_loss_clip": 0.01149136, "auxiliary_loss_mlp": 0.01021703, "balance_loss_clip": 1.04599464, "balance_loss_mlp": 1.01438022, "epoch": 0.9317621595623159, "flos": 30230464688640.0, "grad_norm": 1.5506196554919556, "language_loss": 0.64666671, "learning_rate": 4.857174186738477e-08, "loss": 0.66837507, "num_input_tokens_seen": 167562025, "step": 7749, "time_per_iteration": 2.577256917953491 }, { "auxiliary_loss_clip": 0.01166749, "auxiliary_loss_mlp": 0.010286, "balance_loss_clip": 1.0502038, "balance_loss_mlp": 1.02171016, "epoch": 0.931882402452955, "flos": 15742735966080.0, "grad_norm": 2.5416754432047846, "language_loss": 0.73147076, "learning_rate": 4.840125786176408e-08, "loss": 0.75342429, "num_input_tokens_seen": 167578230, "step": 7750, "time_per_iteration": 2.468268394470215 }, { "auxiliary_loss_clip": 0.01133408, "auxiliary_loss_mlp": 0.01025171, "balance_loss_clip": 1.04608583, "balance_loss_mlp": 1.018507, "epoch": 0.932002645343594, "flos": 28366521154560.0, "grad_norm": 1.7593041787217303, "language_loss": 0.77466881, "learning_rate": 4.823106991212067e-08, "loss": 0.79625463, "num_input_tokens_seen": 167597470, "step": 7751, "time_per_iteration": 2.574995279312134 }, { "auxiliary_loss_clip": 0.01151196, "auxiliary_loss_mlp": 0.01025163, "balance_loss_clip": 1.04577446, "balance_loss_mlp": 1.01869869, "epoch": 0.9321228882342332, "flos": 15341146934400.0, "grad_norm": 1.8711473743901335, "language_loss": 0.83435476, "learning_rate": 4.806117804427212e-08, "loss": 0.85611838, "num_input_tokens_seen": 167615405, "step": 7752, "time_per_iteration": 3.1868414878845215 }, { "auxiliary_loss_clip": 0.01142911, "auxiliary_loss_mlp": 0.01024263, "balance_loss_clip": 1.04370093, "balance_loss_mlp": 1.01671672, "epoch": 0.9322431311248722, "flos": 17895365107200.0, "grad_norm": 2.277888452349502, "language_loss": 0.64905047, "learning_rate": 4.7891582283990926e-08, "loss": 0.67072225, "num_input_tokens_seen": 167634130, "step": 7753, "time_per_iteration": 2.45401930809021 }, { "auxiliary_loss_clip": 0.01118022, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 1.04224992, "balance_loss_mlp": 1.01862395, "epoch": 0.9323633740155113, "flos": 24169713010560.0, "grad_norm": 1.5034441095051343, "language_loss": 0.7292164, "learning_rate": 4.772228265700473e-08, "loss": 0.75064886, "num_input_tokens_seen": 167654990, "step": 7754, "time_per_iteration": 3.3526763916015625 }, { "auxiliary_loss_clip": 0.01150806, "auxiliary_loss_mlp": 0.01025376, "balance_loss_clip": 1.0458349, "balance_loss_mlp": 1.01850677, "epoch": 0.9324836169061504, "flos": 15043482927360.0, "grad_norm": 1.9799520635297339, "language_loss": 0.75821394, "learning_rate": 4.75532791889961e-08, "loss": 0.77997577, "num_input_tokens_seen": 167671690, "step": 7755, "time_per_iteration": 3.2119526863098145 }, { "auxiliary_loss_clip": 0.01148761, "auxiliary_loss_mlp": 0.01025598, "balance_loss_clip": 1.04475999, "balance_loss_mlp": 1.01864815, "epoch": 0.9326038597967895, "flos": 18624890332800.0, "grad_norm": 1.7255725392126338, "language_loss": 0.65477359, "learning_rate": 4.738457190560252e-08, "loss": 0.67651719, "num_input_tokens_seen": 167690800, "step": 7756, "time_per_iteration": 2.4793736934661865 }, { "auxiliary_loss_clip": 0.01113877, "auxiliary_loss_mlp": 0.01026473, "balance_loss_clip": 1.04706132, "balance_loss_mlp": 1.01977921, "epoch": 0.9327241026874286, "flos": 18952646958720.0, "grad_norm": 2.845748143523889, "language_loss": 0.78581214, "learning_rate": 4.721616083241664e-08, "loss": 0.80721563, "num_input_tokens_seen": 167709055, "step": 7757, "time_per_iteration": 2.5709803104400635 }, { "auxiliary_loss_clip": 0.0114475, "auxiliary_loss_mlp": 0.01029283, "balance_loss_clip": 1.0442642, "balance_loss_mlp": 1.02217197, "epoch": 0.9328443455780677, "flos": 29570282668800.0, "grad_norm": 1.651694790742778, "language_loss": 0.77375817, "learning_rate": 4.7048045994986684e-08, "loss": 0.79549849, "num_input_tokens_seen": 167729915, "step": 7758, "time_per_iteration": 2.5611448287963867 }, { "auxiliary_loss_clip": 0.0115335, "auxiliary_loss_mlp": 0.01027993, "balance_loss_clip": 1.04844129, "balance_loss_mlp": 1.02097762, "epoch": 0.9329645884687068, "flos": 30081722469120.0, "grad_norm": 1.9955302244263122, "language_loss": 0.90985668, "learning_rate": 4.688022741881559e-08, "loss": 0.93167007, "num_input_tokens_seen": 167750440, "step": 7759, "time_per_iteration": 2.5574941635131836 }, { "auxiliary_loss_clip": 0.01144913, "auxiliary_loss_mlp": 0.0102206, "balance_loss_clip": 1.04447818, "balance_loss_mlp": 1.01537824, "epoch": 0.9330848313593458, "flos": 21867982513920.0, "grad_norm": 1.5016490263663547, "language_loss": 0.75145042, "learning_rate": 4.671270512936076e-08, "loss": 0.77312016, "num_input_tokens_seen": 167769600, "step": 7760, "time_per_iteration": 2.4879982471466064 }, { "auxiliary_loss_clip": 0.01114244, "auxiliary_loss_mlp": 0.01022858, "balance_loss_clip": 1.04112589, "balance_loss_mlp": 1.01598525, "epoch": 0.933205074249985, "flos": 22127221946880.0, "grad_norm": 1.873066469720172, "language_loss": 0.82614625, "learning_rate": 4.6545479152035884e-08, "loss": 0.84751725, "num_input_tokens_seen": 167788770, "step": 7761, "time_per_iteration": 2.5767979621887207 }, { "auxiliary_loss_clip": 0.0115066, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.04665244, "balance_loss_mlp": 1.02069712, "epoch": 0.9333253171406241, "flos": 15341254675200.0, "grad_norm": 1.873599681868824, "language_loss": 0.75915402, "learning_rate": 4.637854951220821e-08, "loss": 0.78093714, "num_input_tokens_seen": 167805555, "step": 7762, "time_per_iteration": 2.477757215499878 }, { "auxiliary_loss_clip": 0.01117717, "auxiliary_loss_mlp": 0.01026543, "balance_loss_clip": 1.04323828, "balance_loss_mlp": 1.01971197, "epoch": 0.9334455600312631, "flos": 15706142985600.0, "grad_norm": 3.3462118164088803, "language_loss": 0.74944675, "learning_rate": 4.621191623520171e-08, "loss": 0.7708894, "num_input_tokens_seen": 167823985, "step": 7763, "time_per_iteration": 2.5756447315216064 }, { "auxiliary_loss_clip": 0.01102752, "auxiliary_loss_mlp": 0.01023461, "balance_loss_clip": 1.04175389, "balance_loss_mlp": 1.01677883, "epoch": 0.9335658029219023, "flos": 22163563532160.0, "grad_norm": 2.2280610450207248, "language_loss": 0.84517509, "learning_rate": 4.604557934629372e-08, "loss": 0.8664372, "num_input_tokens_seen": 167843060, "step": 7764, "time_per_iteration": 2.6184310913085938 }, { "auxiliary_loss_clip": 0.01131768, "auxiliary_loss_mlp": 0.01026237, "balance_loss_clip": 1.04545546, "balance_loss_mlp": 1.01921272, "epoch": 0.9336860458125413, "flos": 20266833859200.0, "grad_norm": 1.6412641832078307, "language_loss": 0.80325556, "learning_rate": 4.587953887071805e-08, "loss": 0.8248356, "num_input_tokens_seen": 167862880, "step": 7765, "time_per_iteration": 2.5134949684143066 }, { "auxiliary_loss_clip": 0.01131619, "auxiliary_loss_mlp": 0.01027316, "balance_loss_clip": 1.04200387, "balance_loss_mlp": 1.02030027, "epoch": 0.9338062887031804, "flos": 20919689504640.0, "grad_norm": 1.7467057428700739, "language_loss": 0.85794318, "learning_rate": 4.5713794833662554e-08, "loss": 0.87953246, "num_input_tokens_seen": 167882095, "step": 7766, "time_per_iteration": 3.3173017501831055 }, { "auxiliary_loss_clip": 0.01162132, "auxiliary_loss_mlp": 0.01025967, "balance_loss_clip": 1.0461607, "balance_loss_mlp": 1.01852512, "epoch": 0.9339265315938196, "flos": 23221635482880.0, "grad_norm": 1.7987644488154866, "language_loss": 0.63457632, "learning_rate": 4.5548347260270236e-08, "loss": 0.6564573, "num_input_tokens_seen": 167901385, "step": 7767, "time_per_iteration": 2.4785733222961426 }, { "auxiliary_loss_clip": 0.01115186, "auxiliary_loss_mlp": 0.01024204, "balance_loss_clip": 1.04157865, "balance_loss_mlp": 1.01776338, "epoch": 0.9340467744844586, "flos": 22820261932800.0, "grad_norm": 1.966121539265018, "language_loss": 0.69391418, "learning_rate": 4.538319617564012e-08, "loss": 0.71530807, "num_input_tokens_seen": 167920405, "step": 7768, "time_per_iteration": 2.5658674240112305 }, { "auxiliary_loss_clip": 0.01132188, "auxiliary_loss_mlp": 0.01028042, "balance_loss_clip": 1.04178953, "balance_loss_mlp": 1.02092528, "epoch": 0.9341670173750977, "flos": 23660428026240.0, "grad_norm": 1.9478903965202232, "language_loss": 0.74476135, "learning_rate": 4.521834160482485e-08, "loss": 0.76636374, "num_input_tokens_seen": 167939145, "step": 7769, "time_per_iteration": 2.554039239883423 }, { "auxiliary_loss_clip": 0.01151828, "auxiliary_loss_mlp": 0.01024948, "balance_loss_clip": 1.04632258, "balance_loss_mlp": 1.0182904, "epoch": 0.9342872602657368, "flos": 24824256595200.0, "grad_norm": 1.520274059708202, "language_loss": 0.82096165, "learning_rate": 4.5053783572832846e-08, "loss": 0.84272939, "num_input_tokens_seen": 167959325, "step": 7770, "time_per_iteration": 2.519944667816162 }, { "auxiliary_loss_clip": 0.01149142, "auxiliary_loss_mlp": 0.01027543, "balance_loss_clip": 1.04608059, "balance_loss_mlp": 1.02027392, "epoch": 0.9344075031563759, "flos": 25771831332480.0, "grad_norm": 1.6241241863133782, "language_loss": 0.7635957, "learning_rate": 4.488952210462771e-08, "loss": 0.78536254, "num_input_tokens_seen": 167979530, "step": 7771, "time_per_iteration": 2.5844430923461914 }, { "auxiliary_loss_clip": 0.01162367, "auxiliary_loss_mlp": 0.01028155, "balance_loss_clip": 1.04822242, "balance_loss_mlp": 1.0211606, "epoch": 0.9345277460470149, "flos": 25551303782400.0, "grad_norm": 1.8383850076205175, "language_loss": 0.85252535, "learning_rate": 4.4725557225127495e-08, "loss": 0.87443054, "num_input_tokens_seen": 167997870, "step": 7772, "time_per_iteration": 2.484492778778076 }, { "auxiliary_loss_clip": 0.01148989, "auxiliary_loss_mlp": 0.01021634, "balance_loss_clip": 1.04506743, "balance_loss_mlp": 1.01507759, "epoch": 0.9346479889376541, "flos": 34313112432000.0, "grad_norm": 1.5573592336352793, "language_loss": 0.79641294, "learning_rate": 4.456188895920565e-08, "loss": 0.81811917, "num_input_tokens_seen": 168019625, "step": 7773, "time_per_iteration": 2.6160342693328857 }, { "auxiliary_loss_clip": 0.0116321, "auxiliary_loss_mlp": 0.01024626, "balance_loss_clip": 1.04747319, "balance_loss_mlp": 1.01747358, "epoch": 0.9347682318282932, "flos": 19093739581440.0, "grad_norm": 1.969806395182221, "language_loss": 0.85327303, "learning_rate": 4.439851733169031e-08, "loss": 0.8751514, "num_input_tokens_seen": 168037415, "step": 7774, "time_per_iteration": 2.422485113143921 }, { "auxiliary_loss_clip": 0.01121421, "auxiliary_loss_mlp": 0.01023331, "balance_loss_clip": 1.04379892, "balance_loss_mlp": 1.01656938, "epoch": 0.9348884747189322, "flos": 26249587153920.0, "grad_norm": 1.9282969018591154, "language_loss": 0.69955033, "learning_rate": 4.4235442367365204e-08, "loss": 0.72099781, "num_input_tokens_seen": 168057725, "step": 7775, "time_per_iteration": 2.622904062271118 }, { "auxiliary_loss_clip": 0.01130851, "auxiliary_loss_mlp": 0.01032055, "balance_loss_clip": 1.04132187, "balance_loss_mlp": 1.02499211, "epoch": 0.9350087176095714, "flos": 18333080242560.0, "grad_norm": 2.014559490300582, "language_loss": 0.78831863, "learning_rate": 4.4072664090968545e-08, "loss": 0.80994767, "num_input_tokens_seen": 168076110, "step": 7776, "time_per_iteration": 2.5083165168762207 }, { "auxiliary_loss_clip": 0.01135271, "auxiliary_loss_mlp": 0.01027865, "balance_loss_clip": 1.04197037, "balance_loss_mlp": 1.02069139, "epoch": 0.9351289605002104, "flos": 19318253541120.0, "grad_norm": 2.5132918775497615, "language_loss": 0.84829175, "learning_rate": 4.391018252719347e-08, "loss": 0.86992311, "num_input_tokens_seen": 168095905, "step": 7777, "time_per_iteration": 2.5433552265167236 }, { "auxiliary_loss_clip": 0.01137964, "auxiliary_loss_mlp": 0.01028407, "balance_loss_clip": 1.04346514, "balance_loss_mlp": 1.0212307, "epoch": 0.9352492033908495, "flos": 18799990156800.0, "grad_norm": 2.0894379620874024, "language_loss": 0.6897161, "learning_rate": 4.374799770068849e-08, "loss": 0.71137983, "num_input_tokens_seen": 168112580, "step": 7778, "time_per_iteration": 3.2365236282348633 }, { "auxiliary_loss_clip": 0.01146822, "auxiliary_loss_mlp": 0.01024442, "balance_loss_clip": 1.0470407, "balance_loss_mlp": 1.01725674, "epoch": 0.9353694462814887, "flos": 29530134241920.0, "grad_norm": 1.9290050966835304, "language_loss": 0.75001127, "learning_rate": 4.358610963605658e-08, "loss": 0.77172387, "num_input_tokens_seen": 168133030, "step": 7779, "time_per_iteration": 2.5390923023223877 }, { "auxiliary_loss_clip": 0.01164248, "auxiliary_loss_mlp": 0.0102886, "balance_loss_clip": 1.04797077, "balance_loss_mlp": 1.02139425, "epoch": 0.9354896891721277, "flos": 30665450390400.0, "grad_norm": 2.445966841185058, "language_loss": 0.6881392, "learning_rate": 4.342451835785677e-08, "loss": 0.71007031, "num_input_tokens_seen": 168153940, "step": 7780, "time_per_iteration": 3.327244281768799 }, { "auxiliary_loss_clip": 0.01133896, "auxiliary_loss_mlp": 0.010287, "balance_loss_clip": 1.04370999, "balance_loss_mlp": 1.02200031, "epoch": 0.9356099320627668, "flos": 19463907191040.0, "grad_norm": 1.5359818113044614, "language_loss": 0.74871731, "learning_rate": 4.3263223890601665e-08, "loss": 0.7703433, "num_input_tokens_seen": 168172650, "step": 7781, "time_per_iteration": 3.2248542308807373 }, { "auxiliary_loss_clip": 0.01145367, "auxiliary_loss_mlp": 0.00760088, "balance_loss_clip": 1.04758203, "balance_loss_mlp": 1.00029933, "epoch": 0.9357301749534058, "flos": 19098156954240.0, "grad_norm": 1.6981972886785448, "language_loss": 0.79575706, "learning_rate": 4.31022262587597e-08, "loss": 0.81481165, "num_input_tokens_seen": 168191325, "step": 7782, "time_per_iteration": 2.494405508041382 }, { "auxiliary_loss_clip": 0.01152227, "auxiliary_loss_mlp": 0.0103116, "balance_loss_clip": 1.04856575, "balance_loss_mlp": 1.02352166, "epoch": 0.935850417844045, "flos": 23550361776000.0, "grad_norm": 1.4704919420395046, "language_loss": 0.65921724, "learning_rate": 4.2941525486754225e-08, "loss": 0.68105119, "num_input_tokens_seen": 168211645, "step": 7783, "time_per_iteration": 2.5131821632385254 }, { "auxiliary_loss_clip": 0.01113249, "auxiliary_loss_mlp": 0.01021642, "balance_loss_clip": 1.04275298, "balance_loss_mlp": 1.01546443, "epoch": 0.935970660734684, "flos": 18588333265920.0, "grad_norm": 1.76058896685364, "language_loss": 0.79721928, "learning_rate": 4.278112159896286e-08, "loss": 0.81856823, "num_input_tokens_seen": 168229485, "step": 7784, "time_per_iteration": 2.547502040863037 }, { "auxiliary_loss_clip": 0.01126852, "auxiliary_loss_mlp": 0.01023794, "balance_loss_clip": 1.03972912, "balance_loss_mlp": 1.01749647, "epoch": 0.9360909036253231, "flos": 20631255292800.0, "grad_norm": 1.6983895071419197, "language_loss": 0.67569554, "learning_rate": 4.2621014619719896e-08, "loss": 0.69720203, "num_input_tokens_seen": 168247250, "step": 7785, "time_per_iteration": 2.520031690597534 }, { "auxiliary_loss_clip": 0.0103403, "auxiliary_loss_mlp": 0.01002512, "balance_loss_clip": 1.00812817, "balance_loss_mlp": 1.0014571, "epoch": 0.9362111465159623, "flos": 61791421052160.0, "grad_norm": 0.7169455460037275, "language_loss": 0.58633542, "learning_rate": 4.246120457331215e-08, "loss": 0.6067009, "num_input_tokens_seen": 168309425, "step": 7786, "time_per_iteration": 3.1474897861480713 }, { "auxiliary_loss_clip": 0.01128615, "auxiliary_loss_mlp": 0.01030183, "balance_loss_clip": 1.04542637, "balance_loss_mlp": 1.02298307, "epoch": 0.9363313894066013, "flos": 24170395368960.0, "grad_norm": 1.7682128259737324, "language_loss": 0.72013676, "learning_rate": 4.2301691483983325e-08, "loss": 0.74172473, "num_input_tokens_seen": 168329545, "step": 7787, "time_per_iteration": 2.558661699295044 }, { "auxiliary_loss_clip": 0.01153173, "auxiliary_loss_mlp": 0.01025515, "balance_loss_clip": 1.04620755, "balance_loss_mlp": 1.0186367, "epoch": 0.9364516322972404, "flos": 20120354196480.0, "grad_norm": 3.631844610563545, "language_loss": 0.76207221, "learning_rate": 4.214247537593163e-08, "loss": 0.78385907, "num_input_tokens_seen": 168348795, "step": 7788, "time_per_iteration": 2.4943387508392334 }, { "auxiliary_loss_clip": 0.0113694, "auxiliary_loss_mlp": 0.01022434, "balance_loss_clip": 1.04392338, "balance_loss_mlp": 1.01568115, "epoch": 0.9365718751878795, "flos": 20703758895360.0, "grad_norm": 1.776814523996822, "language_loss": 0.80460912, "learning_rate": 4.1983556273309293e-08, "loss": 0.82620281, "num_input_tokens_seen": 168367545, "step": 7789, "time_per_iteration": 2.529265880584717 }, { "auxiliary_loss_clip": 0.01166028, "auxiliary_loss_mlp": 0.01025241, "balance_loss_clip": 1.048877, "balance_loss_mlp": 1.01757574, "epoch": 0.9366921180785186, "flos": 18655270260480.0, "grad_norm": 3.1830713768131917, "language_loss": 0.68943739, "learning_rate": 4.182493420022526e-08, "loss": 0.71135008, "num_input_tokens_seen": 168383215, "step": 7790, "time_per_iteration": 2.416956901550293 }, { "auxiliary_loss_clip": 0.01115976, "auxiliary_loss_mlp": 0.01021112, "balance_loss_clip": 1.0414474, "balance_loss_mlp": 1.01491308, "epoch": 0.9368123609691577, "flos": 25774955815680.0, "grad_norm": 1.7148593750000773, "language_loss": 0.78417742, "learning_rate": 4.166660918074139e-08, "loss": 0.80554831, "num_input_tokens_seen": 168403120, "step": 7791, "time_per_iteration": 2.5783751010894775 }, { "auxiliary_loss_clip": 0.01116481, "auxiliary_loss_mlp": 0.01021417, "balance_loss_clip": 1.04184055, "balance_loss_mlp": 1.01449752, "epoch": 0.9369326038597968, "flos": 25553386771200.0, "grad_norm": 1.493905696052169, "language_loss": 0.73331589, "learning_rate": 4.15085812388758e-08, "loss": 0.75469494, "num_input_tokens_seen": 168425340, "step": 7792, "time_per_iteration": 2.5835413932800293 }, { "auxiliary_loss_clip": 0.01136269, "auxiliary_loss_mlp": 0.01026674, "balance_loss_clip": 1.04496646, "balance_loss_mlp": 1.01994455, "epoch": 0.9370528467504359, "flos": 23220019370880.0, "grad_norm": 1.6023713129318065, "language_loss": 0.78220081, "learning_rate": 4.135085039860153e-08, "loss": 0.80383027, "num_input_tokens_seen": 168444740, "step": 7793, "time_per_iteration": 3.37681245803833 }, { "auxiliary_loss_clip": 0.01134294, "auxiliary_loss_mlp": 0.01024299, "balance_loss_clip": 1.04663372, "balance_loss_mlp": 1.01717675, "epoch": 0.9371730896410749, "flos": 24967468120320.0, "grad_norm": 2.309138611134046, "language_loss": 0.78335238, "learning_rate": 4.1193416683845906e-08, "loss": 0.80493832, "num_input_tokens_seen": 168463670, "step": 7794, "time_per_iteration": 2.6191189289093018 }, { "auxiliary_loss_clip": 0.01126321, "auxiliary_loss_mlp": 0.01023852, "balance_loss_clip": 1.04601216, "balance_loss_mlp": 1.01756036, "epoch": 0.9372933325317141, "flos": 15553091134080.0, "grad_norm": 2.0939335935191274, "language_loss": 0.82850063, "learning_rate": 4.103628011849136e-08, "loss": 0.85000235, "num_input_tokens_seen": 168479030, "step": 7795, "time_per_iteration": 2.5286953449249268 }, { "auxiliary_loss_clip": 0.01138557, "auxiliary_loss_mlp": 0.01028461, "balance_loss_clip": 1.04751682, "balance_loss_mlp": 1.02120686, "epoch": 0.9374135754223532, "flos": 21871861182720.0, "grad_norm": 1.8988982234540235, "language_loss": 0.75823998, "learning_rate": 4.0879440726375506e-08, "loss": 0.77991021, "num_input_tokens_seen": 168496815, "step": 7796, "time_per_iteration": 2.5289549827575684 }, { "auxiliary_loss_clip": 0.01130055, "auxiliary_loss_mlp": 0.0102666, "balance_loss_clip": 1.03962898, "balance_loss_mlp": 1.0192275, "epoch": 0.9375338183129922, "flos": 22631048064000.0, "grad_norm": 2.2711319160270365, "language_loss": 0.55539215, "learning_rate": 4.0722898531291074e-08, "loss": 0.57695937, "num_input_tokens_seen": 168514055, "step": 7797, "time_per_iteration": 2.5229363441467285 }, { "auxiliary_loss_clip": 0.01139704, "auxiliary_loss_mlp": 0.0102228, "balance_loss_clip": 1.04481351, "balance_loss_mlp": 1.01534486, "epoch": 0.9376540612036314, "flos": 26104292640000.0, "grad_norm": 1.7805401920416644, "language_loss": 0.76766115, "learning_rate": 4.0566653556985295e-08, "loss": 0.78928101, "num_input_tokens_seen": 168534600, "step": 7798, "time_per_iteration": 2.556483268737793 }, { "auxiliary_loss_clip": 0.01084069, "auxiliary_loss_mlp": 0.01027195, "balance_loss_clip": 1.04017854, "balance_loss_mlp": 1.01968527, "epoch": 0.9377743040942704, "flos": 19717580016000.0, "grad_norm": 2.109982636322407, "language_loss": 0.81316531, "learning_rate": 4.0410705827159886e-08, "loss": 0.83427799, "num_input_tokens_seen": 168551895, "step": 7799, "time_per_iteration": 2.646326780319214 }, { "auxiliary_loss_clip": 0.01129781, "auxiliary_loss_mlp": 0.01024468, "balance_loss_clip": 1.04100442, "balance_loss_mlp": 1.01778305, "epoch": 0.9378945469849095, "flos": 15267530010240.0, "grad_norm": 2.1479696902140804, "language_loss": 0.70819843, "learning_rate": 4.0255055365472356e-08, "loss": 0.72974092, "num_input_tokens_seen": 168569990, "step": 7800, "time_per_iteration": 2.498863458633423 }, { "auxiliary_loss_clip": 0.01094414, "auxiliary_loss_mlp": 0.01025358, "balance_loss_clip": 1.03764534, "balance_loss_mlp": 1.01878345, "epoch": 0.9380147898755486, "flos": 20591394174720.0, "grad_norm": 2.093777265018202, "language_loss": 0.74845409, "learning_rate": 4.009970219553471e-08, "loss": 0.76965183, "num_input_tokens_seen": 168586940, "step": 7801, "time_per_iteration": 2.622087240219116 }, { "auxiliary_loss_clip": 0.01152881, "auxiliary_loss_mlp": 0.01028458, "balance_loss_clip": 1.0451833, "balance_loss_mlp": 1.02059937, "epoch": 0.9381350327661877, "flos": 26281116316800.0, "grad_norm": 2.387110436471645, "language_loss": 0.76505542, "learning_rate": 3.99446463409141e-08, "loss": 0.78686881, "num_input_tokens_seen": 168604795, "step": 7802, "time_per_iteration": 2.524853467941284 }, { "auxiliary_loss_clip": 0.01151843, "auxiliary_loss_mlp": 0.01026966, "balance_loss_clip": 1.04328144, "balance_loss_mlp": 1.0196408, "epoch": 0.9382552756568268, "flos": 23586344225280.0, "grad_norm": 2.225644310535053, "language_loss": 0.6921078, "learning_rate": 3.978988782513215e-08, "loss": 0.71389592, "num_input_tokens_seen": 168622290, "step": 7803, "time_per_iteration": 2.5081052780151367 }, { "auxiliary_loss_clip": 0.0115173, "auxiliary_loss_mlp": 0.01027586, "balance_loss_clip": 1.04629731, "balance_loss_mlp": 1.02048111, "epoch": 0.9383755185474659, "flos": 28438809275520.0, "grad_norm": 1.554936386355212, "language_loss": 0.76177895, "learning_rate": 3.963542667166586e-08, "loss": 0.78357214, "num_input_tokens_seen": 168642395, "step": 7804, "time_per_iteration": 3.3359320163726807 }, { "auxiliary_loss_clip": 0.01118788, "auxiliary_loss_mlp": 0.01026958, "balance_loss_clip": 1.04516661, "balance_loss_mlp": 1.02004123, "epoch": 0.938495761438105, "flos": 20449583280000.0, "grad_norm": 1.6070979509014245, "language_loss": 0.68308115, "learning_rate": 3.9481262903946486e-08, "loss": 0.70453864, "num_input_tokens_seen": 168661840, "step": 7805, "time_per_iteration": 2.556002616882324 }, { "auxiliary_loss_clip": 0.01019924, "auxiliary_loss_mlp": 0.01002138, "balance_loss_clip": 1.00959826, "balance_loss_mlp": 1.00095165, "epoch": 0.938616004328744, "flos": 69302711658240.0, "grad_norm": 0.7677515167106046, "language_loss": 0.54510033, "learning_rate": 3.932739654536066e-08, "loss": 0.56532097, "num_input_tokens_seen": 168724540, "step": 7806, "time_per_iteration": 3.9419124126434326 }, { "auxiliary_loss_clip": 0.01150038, "auxiliary_loss_mlp": 0.01028722, "balance_loss_clip": 1.04773951, "balance_loss_mlp": 1.02224946, "epoch": 0.9387362472193832, "flos": 18911636605440.0, "grad_norm": 2.3552810100565873, "language_loss": 0.74123824, "learning_rate": 3.917382761925014e-08, "loss": 0.76302576, "num_input_tokens_seen": 168740375, "step": 7807, "time_per_iteration": 3.168048858642578 }, { "auxiliary_loss_clip": 0.01146683, "auxiliary_loss_mlp": 0.01022821, "balance_loss_clip": 1.04670393, "balance_loss_mlp": 1.01608908, "epoch": 0.9388564901100223, "flos": 26501967089280.0, "grad_norm": 1.7801962176963786, "language_loss": 0.7928862, "learning_rate": 3.9020556148910754e-08, "loss": 0.81458127, "num_input_tokens_seen": 168759730, "step": 7808, "time_per_iteration": 2.5362441539764404 }, { "auxiliary_loss_clip": 0.01037689, "auxiliary_loss_mlp": 0.01002015, "balance_loss_clip": 1.008672, "balance_loss_mlp": 1.00088811, "epoch": 0.9389767330006613, "flos": 58941083157120.0, "grad_norm": 0.7112825180991138, "language_loss": 0.56715786, "learning_rate": 3.8867582157593895e-08, "loss": 0.58755481, "num_input_tokens_seen": 168813935, "step": 7809, "time_per_iteration": 2.9470691680908203 }, { "auxiliary_loss_clip": 0.01149256, "auxiliary_loss_mlp": 0.01029238, "balance_loss_clip": 1.0487318, "balance_loss_mlp": 1.02258277, "epoch": 0.9390969758913005, "flos": 31102554994560.0, "grad_norm": 1.56524852068423, "language_loss": 0.76501733, "learning_rate": 3.871490566850544e-08, "loss": 0.78680229, "num_input_tokens_seen": 168838145, "step": 7810, "time_per_iteration": 2.5800228118896484 }, { "auxiliary_loss_clip": 0.0112941, "auxiliary_loss_mlp": 0.01024016, "balance_loss_clip": 1.04333377, "balance_loss_mlp": 1.0173521, "epoch": 0.9392172187819395, "flos": 22419391173120.0, "grad_norm": 1.5443948146054773, "language_loss": 0.70739436, "learning_rate": 3.856252670480642e-08, "loss": 0.72892869, "num_input_tokens_seen": 168856805, "step": 7811, "time_per_iteration": 2.5255258083343506 }, { "auxiliary_loss_clip": 0.01129976, "auxiliary_loss_mlp": 0.01024129, "balance_loss_clip": 1.03986776, "balance_loss_mlp": 1.01770663, "epoch": 0.9393374616725786, "flos": 19719483436800.0, "grad_norm": 1.716715487317215, "language_loss": 0.81214583, "learning_rate": 3.841044528961279e-08, "loss": 0.83368689, "num_input_tokens_seen": 168874600, "step": 7812, "time_per_iteration": 2.5202105045318604 }, { "auxiliary_loss_clip": 0.01162719, "auxiliary_loss_mlp": 0.0102417, "balance_loss_clip": 1.04586053, "balance_loss_mlp": 1.01724422, "epoch": 0.9394577045632178, "flos": 24170215800960.0, "grad_norm": 1.7598535922979848, "language_loss": 0.78586698, "learning_rate": 3.825866144599477e-08, "loss": 0.80773586, "num_input_tokens_seen": 168893655, "step": 7813, "time_per_iteration": 2.484039545059204 }, { "auxiliary_loss_clip": 0.01132334, "auxiliary_loss_mlp": 0.01028508, "balance_loss_clip": 1.04174113, "balance_loss_mlp": 1.02148092, "epoch": 0.9395779474538568, "flos": 19023929498880.0, "grad_norm": 1.858198884327698, "language_loss": 0.75050503, "learning_rate": 3.8107175196978145e-08, "loss": 0.77211344, "num_input_tokens_seen": 168909960, "step": 7814, "time_per_iteration": 2.4888949394226074 }, { "auxiliary_loss_clip": 0.01119674, "auxiliary_loss_mlp": 0.01025018, "balance_loss_clip": 1.04362273, "balance_loss_mlp": 1.01836336, "epoch": 0.9396981903444959, "flos": 14319129260160.0, "grad_norm": 1.8088293081396876, "language_loss": 0.77055323, "learning_rate": 3.7955986565542996e-08, "loss": 0.79200017, "num_input_tokens_seen": 168928040, "step": 7815, "time_per_iteration": 2.5284740924835205 }, { "auxiliary_loss_clip": 0.01118838, "auxiliary_loss_mlp": 0.01022281, "balance_loss_clip": 1.04135907, "balance_loss_mlp": 1.01540899, "epoch": 0.9398184332351349, "flos": 34787564202240.0, "grad_norm": 2.0045543764491462, "language_loss": 0.68187559, "learning_rate": 3.780509557462497e-08, "loss": 0.70328677, "num_input_tokens_seen": 168948240, "step": 7816, "time_per_iteration": 2.6729462146759033 }, { "auxiliary_loss_clip": 0.01129724, "auxiliary_loss_mlp": 0.01026222, "balance_loss_clip": 1.04095507, "balance_loss_mlp": 1.01927173, "epoch": 0.9399386761257741, "flos": 25372253462400.0, "grad_norm": 1.5917918748338227, "language_loss": 0.75544906, "learning_rate": 3.765450224711375e-08, "loss": 0.77700853, "num_input_tokens_seen": 168968745, "step": 7817, "time_per_iteration": 2.5744264125823975 }, { "auxiliary_loss_clip": 0.01127533, "auxiliary_loss_mlp": 0.01025978, "balance_loss_clip": 1.04395938, "balance_loss_mlp": 1.01873863, "epoch": 0.9400589190164131, "flos": 27304965584640.0, "grad_norm": 1.5992913075300974, "language_loss": 0.79398316, "learning_rate": 3.750420660585396e-08, "loss": 0.81551826, "num_input_tokens_seen": 168990685, "step": 7818, "time_per_iteration": 2.562063694000244 }, { "auxiliary_loss_clip": 0.01162659, "auxiliary_loss_mlp": 0.01025244, "balance_loss_clip": 1.04797268, "balance_loss_mlp": 1.01876771, "epoch": 0.9401791619070522, "flos": 23399859790080.0, "grad_norm": 1.6110346793902586, "language_loss": 0.79321873, "learning_rate": 3.735420867364603e-08, "loss": 0.81509781, "num_input_tokens_seen": 169011665, "step": 7819, "time_per_iteration": 3.1933391094207764 }, { "auxiliary_loss_clip": 0.01084481, "auxiliary_loss_mlp": 0.01020883, "balance_loss_clip": 1.03594887, "balance_loss_mlp": 1.01472259, "epoch": 0.9402994047976914, "flos": 35881403120640.0, "grad_norm": 1.7754371540725589, "language_loss": 0.61849916, "learning_rate": 3.7204508473244186e-08, "loss": 0.63955283, "num_input_tokens_seen": 169035290, "step": 7820, "time_per_iteration": 2.745845317840576 }, { "auxiliary_loss_clip": 0.01072868, "auxiliary_loss_mlp": 0.01023915, "balance_loss_clip": 1.03962922, "balance_loss_mlp": 1.01719773, "epoch": 0.9404196476883304, "flos": 22236821320320.0, "grad_norm": 1.5925048918621816, "language_loss": 0.69172627, "learning_rate": 3.7055106027357395e-08, "loss": 0.71269417, "num_input_tokens_seen": 169055155, "step": 7821, "time_per_iteration": 2.7217726707458496 }, { "auxiliary_loss_clip": 0.01143672, "auxiliary_loss_mlp": 0.01024978, "balance_loss_clip": 1.04392219, "balance_loss_mlp": 1.01817143, "epoch": 0.9405398905789695, "flos": 18915802583040.0, "grad_norm": 2.260502111371493, "language_loss": 0.71852183, "learning_rate": 3.690600135865063e-08, "loss": 0.74020839, "num_input_tokens_seen": 169072080, "step": 7822, "time_per_iteration": 2.5672714710235596 }, { "auxiliary_loss_clip": 0.01010678, "auxiliary_loss_mlp": 0.00999595, "balance_loss_clip": 1.0077529, "balance_loss_mlp": 0.99851036, "epoch": 0.9406601334696086, "flos": 70274130048000.0, "grad_norm": 0.7933200997460268, "language_loss": 0.58128268, "learning_rate": 3.675719448974246e-08, "loss": 0.60138541, "num_input_tokens_seen": 169137170, "step": 7823, "time_per_iteration": 3.2249016761779785 }, { "auxiliary_loss_clip": 0.01102067, "auxiliary_loss_mlp": 0.00760849, "balance_loss_clip": 1.04041195, "balance_loss_mlp": 1.00031614, "epoch": 0.9407803763602477, "flos": 22165071903360.0, "grad_norm": 2.198367804024523, "language_loss": 0.59603286, "learning_rate": 3.6608685443207054e-08, "loss": 0.61466205, "num_input_tokens_seen": 169156320, "step": 7824, "time_per_iteration": 2.614950656890869 }, { "auxiliary_loss_clip": 0.01121774, "auxiliary_loss_mlp": 0.01026739, "balance_loss_clip": 1.04263377, "balance_loss_mlp": 1.02048326, "epoch": 0.9409006192508867, "flos": 18879496911360.0, "grad_norm": 2.332072710913994, "language_loss": 0.66374218, "learning_rate": 3.646047424157306e-08, "loss": 0.68522727, "num_input_tokens_seen": 169173295, "step": 7825, "time_per_iteration": 2.5535805225372314 }, { "auxiliary_loss_clip": 0.01137539, "auxiliary_loss_mlp": 0.01028693, "balance_loss_clip": 1.04681182, "balance_loss_mlp": 1.02114105, "epoch": 0.9410208621415259, "flos": 23368258800000.0, "grad_norm": 2.483121316754697, "language_loss": 0.68977094, "learning_rate": 3.631256090732382e-08, "loss": 0.71143329, "num_input_tokens_seen": 169193755, "step": 7826, "time_per_iteration": 2.533738851547241 }, { "auxiliary_loss_clip": 0.01121921, "auxiliary_loss_mlp": 0.01028158, "balance_loss_clip": 1.0443145, "balance_loss_mlp": 1.02128601, "epoch": 0.941141105032165, "flos": 22742227635840.0, "grad_norm": 1.7813442178698875, "language_loss": 0.82422233, "learning_rate": 3.6164945462897833e-08, "loss": 0.84572303, "num_input_tokens_seen": 169213045, "step": 7827, "time_per_iteration": 2.589723587036133 }, { "auxiliary_loss_clip": 0.01147385, "auxiliary_loss_mlp": 0.00760618, "balance_loss_clip": 1.04708707, "balance_loss_mlp": 1.00034666, "epoch": 0.941261347922804, "flos": 20704908130560.0, "grad_norm": 1.6799163954114396, "language_loss": 0.75916636, "learning_rate": 3.6017627930687856e-08, "loss": 0.7782464, "num_input_tokens_seen": 169232870, "step": 7828, "time_per_iteration": 2.4951610565185547 }, { "auxiliary_loss_clip": 0.01100999, "auxiliary_loss_mlp": 0.01024301, "balance_loss_clip": 1.03777122, "balance_loss_mlp": 1.01744914, "epoch": 0.9413815908134432, "flos": 19421998997760.0, "grad_norm": 1.9121151126637648, "language_loss": 0.77189875, "learning_rate": 3.587060833304267e-08, "loss": 0.79315174, "num_input_tokens_seen": 169251060, "step": 7829, "time_per_iteration": 2.597127676010132 }, { "auxiliary_loss_clip": 0.01153732, "auxiliary_loss_mlp": 0.01027121, "balance_loss_clip": 1.04806757, "balance_loss_mlp": 1.02020097, "epoch": 0.9415018337040822, "flos": 17493452853120.0, "grad_norm": 2.3592500473333713, "language_loss": 0.64244378, "learning_rate": 3.5723886692264225e-08, "loss": 0.66425234, "num_input_tokens_seen": 169268600, "step": 7830, "time_per_iteration": 3.2147905826568604 }, { "auxiliary_loss_clip": 0.0113429, "auxiliary_loss_mlp": 0.01025042, "balance_loss_clip": 1.04296708, "balance_loss_mlp": 1.01850986, "epoch": 0.9416220765947213, "flos": 31831613343360.0, "grad_norm": 9.858099993651397, "language_loss": 0.62082922, "learning_rate": 3.557746303061071e-08, "loss": 0.64242256, "num_input_tokens_seen": 169290355, "step": 7831, "time_per_iteration": 2.6179165840148926 }, { "auxiliary_loss_clip": 0.01132442, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 1.04235828, "balance_loss_mlp": 1.02281618, "epoch": 0.9417423194853605, "flos": 23511973115520.0, "grad_norm": 1.6024473452230574, "language_loss": 0.72426558, "learning_rate": 3.543133737029391e-08, "loss": 0.74588811, "num_input_tokens_seen": 169310865, "step": 7832, "time_per_iteration": 3.357239007949829 }, { "auxiliary_loss_clip": 0.01152462, "auxiliary_loss_mlp": 0.01027765, "balance_loss_clip": 1.04619014, "balance_loss_mlp": 1.02049661, "epoch": 0.9418625623759995, "flos": 23915106432000.0, "grad_norm": 1.6996353717873292, "language_loss": 0.69079733, "learning_rate": 3.5285509733481214e-08, "loss": 0.71259964, "num_input_tokens_seen": 169330590, "step": 7833, "time_per_iteration": 3.253981113433838 }, { "auxiliary_loss_clip": 0.01146853, "auxiliary_loss_mlp": 0.01027435, "balance_loss_clip": 1.04561555, "balance_loss_mlp": 1.02029443, "epoch": 0.9419828052666386, "flos": 18076965292800.0, "grad_norm": 1.7293718737511643, "language_loss": 0.76270008, "learning_rate": 3.513998014229469e-08, "loss": 0.7844429, "num_input_tokens_seen": 169349540, "step": 7834, "time_per_iteration": 2.4833109378814697 }, { "auxiliary_loss_clip": 0.01133056, "auxiliary_loss_mlp": 0.01027558, "balance_loss_clip": 1.04426491, "balance_loss_mlp": 1.02051544, "epoch": 0.9421030481572777, "flos": 17712328377600.0, "grad_norm": 2.088423072095952, "language_loss": 0.86368865, "learning_rate": 3.499474861881069e-08, "loss": 0.8852948, "num_input_tokens_seen": 169366765, "step": 7835, "time_per_iteration": 2.4863979816436768 }, { "auxiliary_loss_clip": 0.01091146, "auxiliary_loss_mlp": 0.01026594, "balance_loss_clip": 1.03993261, "balance_loss_mlp": 1.01995659, "epoch": 0.9422232910479168, "flos": 20194114775040.0, "grad_norm": 1.807713037251671, "language_loss": 0.68167871, "learning_rate": 3.4849815185061136e-08, "loss": 0.70285606, "num_input_tokens_seen": 169386655, "step": 7836, "time_per_iteration": 2.636234760284424 }, { "auxiliary_loss_clip": 0.01147066, "auxiliary_loss_mlp": 0.01025486, "balance_loss_clip": 1.04354715, "balance_loss_mlp": 1.01947236, "epoch": 0.9423435339385559, "flos": 18442571875200.0, "grad_norm": 2.239326485236803, "language_loss": 0.75779176, "learning_rate": 3.470517986303223e-08, "loss": 0.77951729, "num_input_tokens_seen": 169405640, "step": 7837, "time_per_iteration": 2.4704041481018066 }, { "auxiliary_loss_clip": 0.01121805, "auxiliary_loss_mlp": 0.01032637, "balance_loss_clip": 1.0446887, "balance_loss_mlp": 1.02550197, "epoch": 0.942463776829195, "flos": 20080636732800.0, "grad_norm": 1.6292308370657256, "language_loss": 0.78878438, "learning_rate": 3.4560842674664856e-08, "loss": 0.81032884, "num_input_tokens_seen": 169424155, "step": 7838, "time_per_iteration": 2.543653726577759 }, { "auxiliary_loss_clip": 0.01148672, "auxiliary_loss_mlp": 0.01025582, "balance_loss_clip": 1.0433445, "balance_loss_mlp": 1.01882291, "epoch": 0.9425840197198341, "flos": 22636255536000.0, "grad_norm": 1.7144492391606259, "language_loss": 0.75177646, "learning_rate": 3.441680364185506e-08, "loss": 0.77351904, "num_input_tokens_seen": 169444025, "step": 7839, "time_per_iteration": 2.5238759517669678 }, { "auxiliary_loss_clip": 0.0113727, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.04598498, "balance_loss_mlp": 1.02473855, "epoch": 0.9427042626104731, "flos": 19937892084480.0, "grad_norm": 2.5039899843402975, "language_loss": 0.7452696, "learning_rate": 3.427306278645314e-08, "loss": 0.76696479, "num_input_tokens_seen": 169462480, "step": 7840, "time_per_iteration": 2.5204057693481445 }, { "auxiliary_loss_clip": 0.01106898, "auxiliary_loss_mlp": 0.01024238, "balance_loss_clip": 1.04308295, "balance_loss_mlp": 1.01671553, "epoch": 0.9428245055011123, "flos": 22856998567680.0, "grad_norm": 4.664057847075759, "language_loss": 0.72968, "learning_rate": 3.4129620130264767e-08, "loss": 0.7509914, "num_input_tokens_seen": 169480840, "step": 7841, "time_per_iteration": 2.5876123905181885 }, { "auxiliary_loss_clip": 0.0113664, "auxiliary_loss_mlp": 0.00760306, "balance_loss_clip": 1.04321957, "balance_loss_mlp": 1.00029588, "epoch": 0.9429447483917514, "flos": 20951757371520.0, "grad_norm": 2.052812771655024, "language_loss": 0.77630752, "learning_rate": 3.398647569505009e-08, "loss": 0.795277, "num_input_tokens_seen": 169498265, "step": 7842, "time_per_iteration": 2.516140937805176 }, { "auxiliary_loss_clip": 0.01126465, "auxiliary_loss_mlp": 0.01026059, "balance_loss_clip": 1.04456389, "balance_loss_mlp": 1.01883483, "epoch": 0.9430649912823904, "flos": 18843658116480.0, "grad_norm": 2.2415034742228874, "language_loss": 0.75060242, "learning_rate": 3.384362950252373e-08, "loss": 0.77212763, "num_input_tokens_seen": 169515235, "step": 7843, "time_per_iteration": 2.5558602809906006 }, { "auxiliary_loss_clip": 0.01130706, "auxiliary_loss_mlp": 0.01026647, "balance_loss_clip": 1.04146719, "balance_loss_mlp": 1.01991749, "epoch": 0.9431852341730296, "flos": 32556038837760.0, "grad_norm": 2.4069058063997915, "language_loss": 0.57203591, "learning_rate": 3.3701081574355473e-08, "loss": 0.59360945, "num_input_tokens_seen": 169537195, "step": 7844, "time_per_iteration": 3.445439577102661 }, { "auxiliary_loss_clip": 0.01033911, "auxiliary_loss_mlp": 0.01003373, "balance_loss_clip": 1.0097481, "balance_loss_mlp": 1.00224018, "epoch": 0.9433054770636686, "flos": 66904490252160.0, "grad_norm": 0.6389987282097837, "language_loss": 0.51684117, "learning_rate": 3.3558831932169796e-08, "loss": 0.53721404, "num_input_tokens_seen": 169605865, "step": 7845, "time_per_iteration": 3.189997434616089 }, { "auxiliary_loss_clip": 0.01147425, "auxiliary_loss_mlp": 0.01022977, "balance_loss_clip": 1.04477513, "balance_loss_mlp": 1.01616096, "epoch": 0.9434257199543077, "flos": 26140346916480.0, "grad_norm": 2.1358149765852743, "language_loss": 0.88235438, "learning_rate": 3.341688059754588e-08, "loss": 0.90405846, "num_input_tokens_seen": 169621520, "step": 7846, "time_per_iteration": 2.542449712753296 }, { "auxiliary_loss_clip": 0.01125119, "auxiliary_loss_mlp": 0.00760281, "balance_loss_clip": 1.04115915, "balance_loss_mlp": 1.00030899, "epoch": 0.9435459628449467, "flos": 25003486483200.0, "grad_norm": 3.133036843827062, "language_loss": 0.78208011, "learning_rate": 3.327522759201762e-08, "loss": 0.80093408, "num_input_tokens_seen": 169641390, "step": 7847, "time_per_iteration": 2.6135764122009277 }, { "auxiliary_loss_clip": 0.01118445, "auxiliary_loss_mlp": 0.01028386, "balance_loss_clip": 1.04257226, "balance_loss_mlp": 1.02132905, "epoch": 0.9436662057355859, "flos": 22163240309760.0, "grad_norm": 2.0289915753863257, "language_loss": 0.66874892, "learning_rate": 3.313387293707359e-08, "loss": 0.6902172, "num_input_tokens_seen": 169660095, "step": 7848, "time_per_iteration": 2.5888869762420654 }, { "auxiliary_loss_clip": 0.01116694, "auxiliary_loss_mlp": 0.01025715, "balance_loss_clip": 1.04335523, "balance_loss_mlp": 1.01841009, "epoch": 0.943786448626225, "flos": 20118522602880.0, "grad_norm": 4.414891000974369, "language_loss": 0.68271494, "learning_rate": 3.29928166541571e-08, "loss": 0.70413899, "num_input_tokens_seen": 169679050, "step": 7849, "time_per_iteration": 2.554841995239258 }, { "auxiliary_loss_clip": 0.01125903, "auxiliary_loss_mlp": 0.01025057, "balance_loss_clip": 1.04370999, "balance_loss_mlp": 1.01821113, "epoch": 0.943906691516864, "flos": 22090808534400.0, "grad_norm": 1.8408814622775624, "language_loss": 0.80458856, "learning_rate": 3.2852058764666346e-08, "loss": 0.82609808, "num_input_tokens_seen": 169698150, "step": 7850, "time_per_iteration": 2.5464818477630615 }, { "auxiliary_loss_clip": 0.0111204, "auxiliary_loss_mlp": 0.01025774, "balance_loss_clip": 1.04529166, "balance_loss_mlp": 1.0195483, "epoch": 0.9440269344075032, "flos": 35298501212160.0, "grad_norm": 2.1687854063073244, "language_loss": 0.68360722, "learning_rate": 3.2711599289954264e-08, "loss": 0.70498538, "num_input_tokens_seen": 169722185, "step": 7851, "time_per_iteration": 2.6829347610473633 }, { "auxiliary_loss_clip": 0.01094991, "auxiliary_loss_mlp": 0.01027016, "balance_loss_clip": 1.04180741, "balance_loss_mlp": 1.02002478, "epoch": 0.9441471772981422, "flos": 19238136255360.0, "grad_norm": 1.6917853199192072, "language_loss": 0.77689737, "learning_rate": 3.257143825132847e-08, "loss": 0.79811746, "num_input_tokens_seen": 169740355, "step": 7852, "time_per_iteration": 2.6205673217773438 }, { "auxiliary_loss_clip": 0.01134252, "auxiliary_loss_mlp": 0.0102459, "balance_loss_clip": 1.04340935, "balance_loss_mlp": 1.01831412, "epoch": 0.9442674201887813, "flos": 25739799379200.0, "grad_norm": 1.8382499479510535, "language_loss": 0.75969118, "learning_rate": 3.243157567005106e-08, "loss": 0.78127962, "num_input_tokens_seen": 169758535, "step": 7853, "time_per_iteration": 2.5508577823638916 }, { "auxiliary_loss_clip": 0.01169893, "auxiliary_loss_mlp": 0.01032921, "balance_loss_clip": 1.0530479, "balance_loss_mlp": 1.0257417, "epoch": 0.9443876630794205, "flos": 15523321737600.0, "grad_norm": 2.0899974014911034, "language_loss": 0.63829231, "learning_rate": 3.2292011567339296e-08, "loss": 0.6603204, "num_input_tokens_seen": 169776340, "step": 7854, "time_per_iteration": 2.4419314861297607 }, { "auxiliary_loss_clip": 0.01148638, "auxiliary_loss_mlp": 0.00760342, "balance_loss_clip": 1.04454553, "balance_loss_mlp": 1.00033689, "epoch": 0.9445079059700595, "flos": 13400821128960.0, "grad_norm": 2.1562052841905106, "language_loss": 0.55630159, "learning_rate": 3.21527459643649e-08, "loss": 0.57539141, "num_input_tokens_seen": 169793225, "step": 7855, "time_per_iteration": 3.3025741577148438 }, { "auxiliary_loss_clip": 0.01151573, "auxiliary_loss_mlp": 0.01025829, "balance_loss_clip": 1.04673398, "balance_loss_mlp": 1.01954961, "epoch": 0.9446281488606986, "flos": 23659242877440.0, "grad_norm": 1.742933173610097, "language_loss": 0.74262267, "learning_rate": 3.2013778882254536e-08, "loss": 0.76439673, "num_input_tokens_seen": 169812020, "step": 7856, "time_per_iteration": 2.516793727874756 }, { "auxiliary_loss_clip": 0.0113971, "auxiliary_loss_mlp": 0.01030192, "balance_loss_clip": 1.04354227, "balance_loss_mlp": 1.02341247, "epoch": 0.9447483917513377, "flos": 25557337267200.0, "grad_norm": 1.7741789508996997, "language_loss": 0.76081747, "learning_rate": 3.1875110342088676e-08, "loss": 0.78251648, "num_input_tokens_seen": 169833470, "step": 7857, "time_per_iteration": 3.3104164600372314 }, { "auxiliary_loss_clip": 0.01129899, "auxiliary_loss_mlp": 0.01023114, "balance_loss_clip": 1.04460156, "balance_loss_mlp": 1.01666522, "epoch": 0.9448686346419768, "flos": 24535463247360.0, "grad_norm": 1.699922696872583, "language_loss": 0.65175974, "learning_rate": 3.1736740364904035e-08, "loss": 0.6732899, "num_input_tokens_seen": 169854000, "step": 7858, "time_per_iteration": 2.5478928089141846 }, { "auxiliary_loss_clip": 0.01104605, "auxiliary_loss_mlp": 0.00760612, "balance_loss_clip": 1.04174995, "balance_loss_mlp": 1.00031769, "epoch": 0.9449888775326158, "flos": 14721256995840.0, "grad_norm": 2.263865333191504, "language_loss": 0.77223802, "learning_rate": 3.159866897169094e-08, "loss": 0.79089022, "num_input_tokens_seen": 169872200, "step": 7859, "time_per_iteration": 2.5768275260925293 }, { "auxiliary_loss_clip": 0.01125341, "auxiliary_loss_mlp": 0.01030128, "balance_loss_clip": 1.04384315, "balance_loss_mlp": 1.02251673, "epoch": 0.945109120423255, "flos": 15447873219840.0, "grad_norm": 1.8162390693322277, "language_loss": 0.75274599, "learning_rate": 3.146089618339487e-08, "loss": 0.77430069, "num_input_tokens_seen": 169889055, "step": 7860, "time_per_iteration": 3.286191701889038 }, { "auxiliary_loss_clip": 0.01121783, "auxiliary_loss_mlp": 0.01024994, "balance_loss_clip": 1.0420121, "balance_loss_mlp": 1.01776361, "epoch": 0.9452293633138941, "flos": 25448097029760.0, "grad_norm": 1.7580521866105308, "language_loss": 0.67982447, "learning_rate": 3.132342202091554e-08, "loss": 0.70129228, "num_input_tokens_seen": 169909280, "step": 7861, "time_per_iteration": 2.604379177093506 }, { "auxiliary_loss_clip": 0.01163072, "auxiliary_loss_mlp": 0.01023795, "balance_loss_clip": 1.04587913, "balance_loss_mlp": 1.01689577, "epoch": 0.9453496062045331, "flos": 21215342350080.0, "grad_norm": 2.3163921762467674, "language_loss": 0.68567705, "learning_rate": 3.1186246505107595e-08, "loss": 0.7075457, "num_input_tokens_seen": 169928420, "step": 7862, "time_per_iteration": 2.465827465057373 }, { "auxiliary_loss_clip": 0.01151191, "auxiliary_loss_mlp": 0.01023407, "balance_loss_clip": 1.04845119, "balance_loss_mlp": 1.01624215, "epoch": 0.9454698490951723, "flos": 20010898477440.0, "grad_norm": 1.8715530446747268, "language_loss": 0.83447945, "learning_rate": 3.104936965678084e-08, "loss": 0.85622549, "num_input_tokens_seen": 169946750, "step": 7863, "time_per_iteration": 2.476027250289917 }, { "auxiliary_loss_clip": 0.0114921, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.04488873, "balance_loss_mlp": 1.01803815, "epoch": 0.9455900919858113, "flos": 21069652786560.0, "grad_norm": 1.7879559228512736, "language_loss": 0.81843674, "learning_rate": 3.091279149669956e-08, "loss": 0.84018463, "num_input_tokens_seen": 169965540, "step": 7864, "time_per_iteration": 2.500020742416382 }, { "auxiliary_loss_clip": 0.01147356, "auxiliary_loss_mlp": 0.00760474, "balance_loss_clip": 1.04412127, "balance_loss_mlp": 1.00026524, "epoch": 0.9457103348764504, "flos": 20740854666240.0, "grad_norm": 1.9315327949157024, "language_loss": 0.73934829, "learning_rate": 3.0776512045581624e-08, "loss": 0.75842655, "num_input_tokens_seen": 169984330, "step": 7865, "time_per_iteration": 2.505927562713623 }, { "auxiliary_loss_clip": 0.01128865, "auxiliary_loss_mlp": 0.01025206, "balance_loss_clip": 1.04384089, "balance_loss_mlp": 1.01733851, "epoch": 0.9458305777670896, "flos": 21428363957760.0, "grad_norm": 1.8808828668393796, "language_loss": 0.77449566, "learning_rate": 3.0640531324101384e-08, "loss": 0.79603636, "num_input_tokens_seen": 170002095, "step": 7866, "time_per_iteration": 2.5965847969055176 }, { "auxiliary_loss_clip": 0.01153507, "auxiliary_loss_mlp": 0.01025691, "balance_loss_clip": 1.04923129, "balance_loss_mlp": 1.01853824, "epoch": 0.9459508206577286, "flos": 20011185786240.0, "grad_norm": 1.530999888074661, "language_loss": 0.76049536, "learning_rate": 3.0504849352886554e-08, "loss": 0.78228736, "num_input_tokens_seen": 170020240, "step": 7867, "time_per_iteration": 2.492549180984497 }, { "auxiliary_loss_clip": 0.01149181, "auxiliary_loss_mlp": 0.01025617, "balance_loss_clip": 1.04730916, "balance_loss_mlp": 1.01850605, "epoch": 0.9460710635483677, "flos": 12166428291840.0, "grad_norm": 2.473424405570833, "language_loss": 0.7173934, "learning_rate": 3.036946615252023e-08, "loss": 0.73914135, "num_input_tokens_seen": 170035770, "step": 7868, "time_per_iteration": 2.4889395236968994 }, { "auxiliary_loss_clip": 0.01136824, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.04531407, "balance_loss_mlp": 1.02638626, "epoch": 0.9461913064390068, "flos": 34276196229120.0, "grad_norm": 3.7906350228915198, "language_loss": 0.67041469, "learning_rate": 3.0234381743539984e-08, "loss": 0.69211709, "num_input_tokens_seen": 170053385, "step": 7869, "time_per_iteration": 2.616807699203491 }, { "auxiliary_loss_clip": 0.01140782, "auxiliary_loss_mlp": 0.01027509, "balance_loss_clip": 1.0441072, "balance_loss_mlp": 1.02090776, "epoch": 0.9463115493296459, "flos": 19463763536640.0, "grad_norm": 2.0142482925192153, "language_loss": 0.79791063, "learning_rate": 3.0099596146437863e-08, "loss": 0.81959355, "num_input_tokens_seen": 170070490, "step": 7870, "time_per_iteration": 3.2165918350219727 }, { "auxiliary_loss_clip": 0.0105463, "auxiliary_loss_mlp": 0.01003958, "balance_loss_clip": 1.00776398, "balance_loss_mlp": 1.00284922, "epoch": 0.946431792220285, "flos": 70570824387840.0, "grad_norm": 0.7697381012311698, "language_loss": 0.60093939, "learning_rate": 2.996510938166086e-08, "loss": 0.62152529, "num_input_tokens_seen": 170133465, "step": 7871, "time_per_iteration": 3.132199287414551 }, { "auxiliary_loss_clip": 0.01147081, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.04651105, "balance_loss_mlp": 1.02726912, "epoch": 0.9465520351109241, "flos": 18947906363520.0, "grad_norm": 1.8024686988521683, "language_loss": 0.73806316, "learning_rate": 2.983092146960997e-08, "loss": 0.75987482, "num_input_tokens_seen": 170150810, "step": 7872, "time_per_iteration": 2.486224889755249 }, { "auxiliary_loss_clip": 0.01135064, "auxiliary_loss_mlp": 0.01027304, "balance_loss_clip": 1.04281354, "balance_loss_mlp": 1.02004457, "epoch": 0.9466722780015632, "flos": 19135647774720.0, "grad_norm": 2.084563397428503, "language_loss": 0.80198222, "learning_rate": 2.9697032430642256e-08, "loss": 0.8236059, "num_input_tokens_seen": 170169025, "step": 7873, "time_per_iteration": 2.5300962924957275 }, { "auxiliary_loss_clip": 0.01157573, "auxiliary_loss_mlp": 0.01025345, "balance_loss_clip": 1.04597378, "balance_loss_mlp": 1.01848495, "epoch": 0.9467925208922022, "flos": 17237912520960.0, "grad_norm": 2.0821897676764998, "language_loss": 0.73474622, "learning_rate": 2.9563442285067906e-08, "loss": 0.75657535, "num_input_tokens_seen": 170186070, "step": 7874, "time_per_iteration": 2.42889404296875 }, { "auxiliary_loss_clip": 0.0115291, "auxiliary_loss_mlp": 0.01024495, "balance_loss_clip": 1.04913282, "balance_loss_mlp": 1.01691651, "epoch": 0.9469127637828414, "flos": 29169016859520.0, "grad_norm": 1.78074238174984, "language_loss": 0.79711008, "learning_rate": 2.943015105315294e-08, "loss": 0.81888413, "num_input_tokens_seen": 170206265, "step": 7875, "time_per_iteration": 2.5479679107666016 }, { "auxiliary_loss_clip": 0.01109656, "auxiliary_loss_mlp": 0.01031744, "balance_loss_clip": 1.0415678, "balance_loss_mlp": 1.02467251, "epoch": 0.9470330066734804, "flos": 26030460234240.0, "grad_norm": 2.2084693325018487, "language_loss": 0.66389579, "learning_rate": 2.929715875511718e-08, "loss": 0.68530977, "num_input_tokens_seen": 170225300, "step": 7876, "time_per_iteration": 2.6138758659362793 }, { "auxiliary_loss_clip": 0.01148309, "auxiliary_loss_mlp": 0.01031402, "balance_loss_clip": 1.04210925, "balance_loss_mlp": 1.0243361, "epoch": 0.9471532495641195, "flos": 23440906056960.0, "grad_norm": 1.7670058736927796, "language_loss": 0.6969589, "learning_rate": 2.9164465411135375e-08, "loss": 0.71875596, "num_input_tokens_seen": 170245070, "step": 7877, "time_per_iteration": 2.505213975906372 }, { "auxiliary_loss_clip": 0.01151732, "auxiliary_loss_mlp": 0.01029083, "balance_loss_clip": 1.04932225, "balance_loss_mlp": 1.02222288, "epoch": 0.9472734924547586, "flos": 15815850099840.0, "grad_norm": 1.8234716944687261, "language_loss": 0.80500925, "learning_rate": 2.9032071041337426e-08, "loss": 0.82681739, "num_input_tokens_seen": 170263305, "step": 7878, "time_per_iteration": 2.4810664653778076 }, { "auxiliary_loss_clip": 0.01128224, "auxiliary_loss_mlp": 0.01024633, "balance_loss_clip": 1.04427028, "balance_loss_mlp": 1.01792192, "epoch": 0.9473937353453977, "flos": 11181793697280.0, "grad_norm": 1.5489906720755573, "language_loss": 0.72767842, "learning_rate": 2.889997566580704e-08, "loss": 0.74920702, "num_input_tokens_seen": 170281460, "step": 7879, "time_per_iteration": 2.505730628967285 }, { "auxiliary_loss_clip": 0.01165359, "auxiliary_loss_mlp": 0.01028827, "balance_loss_clip": 1.04741037, "balance_loss_mlp": 1.02155268, "epoch": 0.9475139782360368, "flos": 25775530433280.0, "grad_norm": 2.2171724764294285, "language_loss": 0.70444131, "learning_rate": 2.8768179304583086e-08, "loss": 0.72638321, "num_input_tokens_seen": 170303515, "step": 7880, "time_per_iteration": 2.4875340461730957 }, { "auxiliary_loss_clip": 0.01123962, "auxiliary_loss_mlp": 0.0102615, "balance_loss_clip": 1.04714239, "balance_loss_mlp": 1.01964974, "epoch": 0.9476342211266758, "flos": 22820046451200.0, "grad_norm": 1.528788223455236, "language_loss": 0.73573351, "learning_rate": 2.8636681977659117e-08, "loss": 0.75723463, "num_input_tokens_seen": 170323165, "step": 7881, "time_per_iteration": 2.5657408237457275 }, { "auxiliary_loss_clip": 0.01104558, "auxiliary_loss_mlp": 0.01031958, "balance_loss_clip": 1.04321361, "balance_loss_mlp": 1.02509737, "epoch": 0.947754464017315, "flos": 20193611984640.0, "grad_norm": 1.9706916207313943, "language_loss": 0.78007978, "learning_rate": 2.850548370498318e-08, "loss": 0.80144489, "num_input_tokens_seen": 170341005, "step": 7882, "time_per_iteration": 3.3997416496276855 }, { "auxiliary_loss_clip": 0.01145991, "auxiliary_loss_mlp": 0.01023069, "balance_loss_clip": 1.04284871, "balance_loss_mlp": 1.01674843, "epoch": 0.9478747069079541, "flos": 24717925359360.0, "grad_norm": 1.6178602827214181, "language_loss": 0.71245843, "learning_rate": 2.8374584506457798e-08, "loss": 0.73414904, "num_input_tokens_seen": 170362280, "step": 7883, "time_per_iteration": 3.321948528289795 }, { "auxiliary_loss_clip": 0.01135068, "auxiliary_loss_mlp": 0.01027144, "balance_loss_clip": 1.04606175, "balance_loss_mlp": 1.01997328, "epoch": 0.9479949497985931, "flos": 21361355136000.0, "grad_norm": 2.068504587678788, "language_loss": 0.67222464, "learning_rate": 2.824398440193998e-08, "loss": 0.69384676, "num_input_tokens_seen": 170381080, "step": 7884, "time_per_iteration": 2.53171706199646 }, { "auxiliary_loss_clip": 0.01099065, "auxiliary_loss_mlp": 0.01024462, "balance_loss_clip": 1.03951693, "balance_loss_mlp": 1.0173986, "epoch": 0.9481151926892323, "flos": 18148606968960.0, "grad_norm": 1.9560648701428318, "language_loss": 0.71251249, "learning_rate": 2.811368341124232e-08, "loss": 0.73374772, "num_input_tokens_seen": 170400150, "step": 7885, "time_per_iteration": 3.342376470565796 }, { "auxiliary_loss_clip": 0.01134411, "auxiliary_loss_mlp": 0.01022721, "balance_loss_clip": 1.04384851, "balance_loss_mlp": 1.01624227, "epoch": 0.9482354355798713, "flos": 22128012046080.0, "grad_norm": 2.0136728404733315, "language_loss": 0.67911029, "learning_rate": 2.7983681554131222e-08, "loss": 0.70068157, "num_input_tokens_seen": 170420410, "step": 7886, "time_per_iteration": 2.5555262565612793 }, { "auxiliary_loss_clip": 0.01131899, "auxiliary_loss_mlp": 0.01027411, "balance_loss_clip": 1.04265809, "balance_loss_mlp": 1.02030635, "epoch": 0.9483556784705104, "flos": 19063072344960.0, "grad_norm": 2.0583865246900537, "language_loss": 0.7023648, "learning_rate": 2.7853978850327365e-08, "loss": 0.72395796, "num_input_tokens_seen": 170439580, "step": 7887, "time_per_iteration": 2.5009889602661133 }, { "auxiliary_loss_clip": 0.01120222, "auxiliary_loss_mlp": 0.01027582, "balance_loss_clip": 1.04580867, "balance_loss_mlp": 1.02137113, "epoch": 0.9484759213611496, "flos": 25777110631680.0, "grad_norm": 1.836767995519403, "language_loss": 0.87437701, "learning_rate": 2.7724575319507225e-08, "loss": 0.89585507, "num_input_tokens_seen": 170459290, "step": 7888, "time_per_iteration": 2.6226418018341064 }, { "auxiliary_loss_clip": 0.01148595, "auxiliary_loss_mlp": 0.01028096, "balance_loss_clip": 1.04393983, "balance_loss_mlp": 1.02165556, "epoch": 0.9485961642517886, "flos": 20667740532480.0, "grad_norm": 1.7593854987719415, "language_loss": 0.76858532, "learning_rate": 2.759547098130044e-08, "loss": 0.79035223, "num_input_tokens_seen": 170478020, "step": 7889, "time_per_iteration": 2.5076053142547607 }, { "auxiliary_loss_clip": 0.01161402, "auxiliary_loss_mlp": 0.01026061, "balance_loss_clip": 1.04713941, "balance_loss_mlp": 1.01995742, "epoch": 0.9487164071424277, "flos": 22674069578880.0, "grad_norm": 1.920853285690736, "language_loss": 0.76929057, "learning_rate": 2.746666585529267e-08, "loss": 0.79116517, "num_input_tokens_seen": 170498295, "step": 7890, "time_per_iteration": 2.4890506267547607 }, { "auxiliary_loss_clip": 0.0114209, "auxiliary_loss_mlp": 0.01022091, "balance_loss_clip": 1.04433906, "balance_loss_mlp": 1.01498318, "epoch": 0.9488366500330668, "flos": 38726461716480.0, "grad_norm": 2.074887690392293, "language_loss": 0.73870969, "learning_rate": 2.73381599610234e-08, "loss": 0.76035148, "num_input_tokens_seen": 170518695, "step": 7891, "time_per_iteration": 2.639042854309082 }, { "auxiliary_loss_clip": 0.01143165, "auxiliary_loss_mlp": 0.01024072, "balance_loss_clip": 1.04269743, "balance_loss_mlp": 1.01741409, "epoch": 0.9489568929237059, "flos": 27890920149120.0, "grad_norm": 1.9831387733387673, "language_loss": 0.71669292, "learning_rate": 2.7209953317987033e-08, "loss": 0.73836529, "num_input_tokens_seen": 170539735, "step": 7892, "time_per_iteration": 2.5367581844329834 }, { "auxiliary_loss_clip": 0.01149749, "auxiliary_loss_mlp": 0.01020216, "balance_loss_clip": 1.04625344, "balance_loss_mlp": 1.01309597, "epoch": 0.9490771358143449, "flos": 33580642291200.0, "grad_norm": 1.8415623170070479, "language_loss": 0.78201765, "learning_rate": 2.7082045945631793e-08, "loss": 0.80371726, "num_input_tokens_seen": 170561950, "step": 7893, "time_per_iteration": 2.5902230739593506 }, { "auxiliary_loss_clip": 0.01115048, "auxiliary_loss_mlp": 0.01026976, "balance_loss_clip": 1.04270387, "balance_loss_mlp": 1.02004993, "epoch": 0.9491973787049841, "flos": 14793796512000.0, "grad_norm": 2.0322578596917427, "language_loss": 0.69408488, "learning_rate": 2.6954437863361712e-08, "loss": 0.71550512, "num_input_tokens_seen": 170579865, "step": 7894, "time_per_iteration": 2.533811092376709 }, { "auxiliary_loss_clip": 0.01097011, "auxiliary_loss_mlp": 0.01021375, "balance_loss_clip": 1.04017842, "balance_loss_mlp": 1.01506305, "epoch": 0.9493176215956232, "flos": 25332535998720.0, "grad_norm": 2.0920236436505624, "language_loss": 0.7055819, "learning_rate": 2.6827129090534862e-08, "loss": 0.72676575, "num_input_tokens_seen": 170600165, "step": 7895, "time_per_iteration": 2.6666183471679688 }, { "auxiliary_loss_clip": 0.01130675, "auxiliary_loss_mlp": 0.01022661, "balance_loss_clip": 1.04381776, "balance_loss_mlp": 1.0161252, "epoch": 0.9494378644862622, "flos": 21029971236480.0, "grad_norm": 2.024898480946682, "language_loss": 0.77630615, "learning_rate": 2.670011964646335e-08, "loss": 0.79783952, "num_input_tokens_seen": 170618845, "step": 7896, "time_per_iteration": 3.335475206375122 }, { "auxiliary_loss_clip": 0.01082419, "auxiliary_loss_mlp": 0.01025014, "balance_loss_clip": 1.03480339, "balance_loss_mlp": 1.01818609, "epoch": 0.9495581073769014, "flos": 15195134148480.0, "grad_norm": 1.8714646218424649, "language_loss": 0.67868996, "learning_rate": 2.657340955041487e-08, "loss": 0.69976425, "num_input_tokens_seen": 170637620, "step": 7897, "time_per_iteration": 2.6449859142303467 }, { "auxiliary_loss_clip": 0.01131055, "auxiliary_loss_mlp": 0.01029606, "balance_loss_clip": 1.04510069, "balance_loss_mlp": 1.02200317, "epoch": 0.9496783502675404, "flos": 28616566705920.0, "grad_norm": 1.8926893590468887, "language_loss": 0.71491122, "learning_rate": 2.6446998821611167e-08, "loss": 0.73651785, "num_input_tokens_seen": 170657815, "step": 7898, "time_per_iteration": 2.5684080123901367 }, { "auxiliary_loss_clip": 0.0110877, "auxiliary_loss_mlp": 0.01029062, "balance_loss_clip": 1.04281676, "balance_loss_mlp": 1.02230239, "epoch": 0.9497985931581795, "flos": 14866874732160.0, "grad_norm": 2.4701581353210753, "language_loss": 0.7157073, "learning_rate": 2.6320887479228228e-08, "loss": 0.73708564, "num_input_tokens_seen": 170674415, "step": 7899, "time_per_iteration": 2.547853469848633 }, { "auxiliary_loss_clip": 0.01138222, "auxiliary_loss_mlp": 0.01022155, "balance_loss_clip": 1.04493666, "balance_loss_mlp": 1.01541114, "epoch": 0.9499188360488187, "flos": 27193319136000.0, "grad_norm": 2.295392077443322, "language_loss": 0.72826374, "learning_rate": 2.619507554239786e-08, "loss": 0.74986744, "num_input_tokens_seen": 170692975, "step": 7900, "time_per_iteration": 2.570667266845703 }, { "auxiliary_loss_clip": 0.01132942, "auxiliary_loss_mlp": 0.01024651, "balance_loss_clip": 1.04412162, "balance_loss_mlp": 1.01779652, "epoch": 0.9500390789394577, "flos": 24316479982080.0, "grad_norm": 1.545141823646415, "language_loss": 0.69540977, "learning_rate": 2.606956303020502e-08, "loss": 0.7169857, "num_input_tokens_seen": 170713780, "step": 7901, "time_per_iteration": 2.5605649948120117 }, { "auxiliary_loss_clip": 0.01150465, "auxiliary_loss_mlp": 0.0102552, "balance_loss_clip": 1.04760814, "balance_loss_mlp": 1.01857591, "epoch": 0.9501593218300968, "flos": 14354752573440.0, "grad_norm": 1.8068983799193632, "language_loss": 0.84060031, "learning_rate": 2.5944349961690036e-08, "loss": 0.86236018, "num_input_tokens_seen": 170730800, "step": 7902, "time_per_iteration": 2.4686028957366943 }, { "auxiliary_loss_clip": 0.01121914, "auxiliary_loss_mlp": 0.01023832, "balance_loss_clip": 1.04303932, "balance_loss_mlp": 1.01698303, "epoch": 0.9502795647207359, "flos": 38728113742080.0, "grad_norm": 1.7650757846732053, "language_loss": 0.72842133, "learning_rate": 2.581943635584749e-08, "loss": 0.74987876, "num_input_tokens_seen": 170753630, "step": 7903, "time_per_iteration": 2.7076985836029053 }, { "auxiliary_loss_clip": 0.01127932, "auxiliary_loss_mlp": 0.01031542, "balance_loss_clip": 1.04428411, "balance_loss_mlp": 1.02475643, "epoch": 0.950399807611375, "flos": 40808023799040.0, "grad_norm": 1.4812359540192912, "language_loss": 0.653153, "learning_rate": 2.569482223162689e-08, "loss": 0.67474771, "num_input_tokens_seen": 170777605, "step": 7904, "time_per_iteration": 2.68632173538208 }, { "auxiliary_loss_clip": 0.01149057, "auxiliary_loss_mlp": 0.01025605, "balance_loss_clip": 1.04560804, "balance_loss_mlp": 1.01855659, "epoch": 0.950520050502014, "flos": 23440403266560.0, "grad_norm": 1.6685831155075352, "language_loss": 0.72111511, "learning_rate": 2.5570507607932e-08, "loss": 0.74286175, "num_input_tokens_seen": 170797520, "step": 7905, "time_per_iteration": 2.5133769512176514 }, { "auxiliary_loss_clip": 0.01151071, "auxiliary_loss_mlp": 0.01026718, "balance_loss_clip": 1.04508543, "balance_loss_mlp": 1.01955962, "epoch": 0.9506402933926532, "flos": 17783718658560.0, "grad_norm": 3.7973890661965215, "language_loss": 0.64131904, "learning_rate": 2.54464925036213e-08, "loss": 0.6630969, "num_input_tokens_seen": 170814810, "step": 7906, "time_per_iteration": 2.4693918228149414 }, { "auxiliary_loss_clip": 0.01148256, "auxiliary_loss_mlp": 0.0102466, "balance_loss_clip": 1.04694366, "balance_loss_mlp": 1.01777303, "epoch": 0.9507605362832923, "flos": 32561928668160.0, "grad_norm": 1.761703395028214, "language_loss": 0.60960472, "learning_rate": 2.532277693750773e-08, "loss": 0.63133383, "num_input_tokens_seen": 170835735, "step": 7907, "time_per_iteration": 2.561464309692383 }, { "auxiliary_loss_clip": 0.01103562, "auxiliary_loss_mlp": 0.01025205, "balance_loss_clip": 1.04317939, "balance_loss_mlp": 1.01838017, "epoch": 0.9508807791739313, "flos": 19602054898560.0, "grad_norm": 2.0120881366925656, "language_loss": 0.75609457, "learning_rate": 2.5199360928358948e-08, "loss": 0.77738225, "num_input_tokens_seen": 170852970, "step": 7908, "time_per_iteration": 3.3993828296661377 }, { "auxiliary_loss_clip": 0.0113914, "auxiliary_loss_mlp": 0.00760052, "balance_loss_clip": 1.04289854, "balance_loss_mlp": 1.0002929, "epoch": 0.9510010220645704, "flos": 21471852349440.0, "grad_norm": 1.5820022184686973, "language_loss": 0.86942559, "learning_rate": 2.507624449489665e-08, "loss": 0.8884176, "num_input_tokens_seen": 170871600, "step": 7909, "time_per_iteration": 3.3025944232940674 }, { "auxiliary_loss_clip": 0.01134441, "auxiliary_loss_mlp": 0.01024182, "balance_loss_clip": 1.04499483, "balance_loss_mlp": 1.01739287, "epoch": 0.9511212649552095, "flos": 18879999701760.0, "grad_norm": 2.1207602743605447, "language_loss": 0.65253234, "learning_rate": 2.495342765579811e-08, "loss": 0.67411858, "num_input_tokens_seen": 170890260, "step": 7910, "time_per_iteration": 2.513471841812134 }, { "auxiliary_loss_clip": 0.011021, "auxiliary_loss_mlp": 0.01025021, "balance_loss_clip": 1.04042482, "balance_loss_mlp": 1.0178324, "epoch": 0.9512415078458486, "flos": 20810521094400.0, "grad_norm": 1.8478054926460676, "language_loss": 0.70757937, "learning_rate": 2.4830910429693984e-08, "loss": 0.7288506, "num_input_tokens_seen": 170910220, "step": 7911, "time_per_iteration": 3.3309640884399414 }, { "auxiliary_loss_clip": 0.01162709, "auxiliary_loss_mlp": 0.01025653, "balance_loss_clip": 1.04718328, "balance_loss_mlp": 1.01855707, "epoch": 0.9513617507364877, "flos": 18369565482240.0, "grad_norm": 1.801422876623645, "language_loss": 0.7967304, "learning_rate": 2.470869283517052e-08, "loss": 0.81861401, "num_input_tokens_seen": 170928255, "step": 7912, "time_per_iteration": 2.443518877029419 }, { "auxiliary_loss_clip": 0.01144583, "auxiliary_loss_mlp": 0.01027009, "balance_loss_clip": 1.04535592, "balance_loss_mlp": 1.02015197, "epoch": 0.9514819936271268, "flos": 25010166412800.0, "grad_norm": 1.6857098925852978, "language_loss": 0.77014732, "learning_rate": 2.458677489076777e-08, "loss": 0.79186326, "num_input_tokens_seen": 170949265, "step": 7913, "time_per_iteration": 2.524535894393921 }, { "auxiliary_loss_clip": 0.01141022, "auxiliary_loss_mlp": 0.01028338, "balance_loss_clip": 1.04459929, "balance_loss_mlp": 1.02217209, "epoch": 0.9516022365177659, "flos": 18662129758080.0, "grad_norm": 1.649405978975886, "language_loss": 0.82723534, "learning_rate": 2.446515661498072e-08, "loss": 0.84892893, "num_input_tokens_seen": 170968595, "step": 7914, "time_per_iteration": 2.497105121612549 }, { "auxiliary_loss_clip": 0.01091358, "auxiliary_loss_mlp": 0.01020692, "balance_loss_clip": 1.03982162, "balance_loss_mlp": 1.01407909, "epoch": 0.9517224794084049, "flos": 25372109808000.0, "grad_norm": 1.9501716516712877, "language_loss": 0.73938251, "learning_rate": 2.434383802625861e-08, "loss": 0.76050305, "num_input_tokens_seen": 170987550, "step": 7915, "time_per_iteration": 2.6496002674102783 }, { "auxiliary_loss_clip": 0.01120976, "auxiliary_loss_mlp": 0.01022001, "balance_loss_clip": 1.04071927, "balance_loss_mlp": 1.01514053, "epoch": 0.9518427222990441, "flos": 21470918595840.0, "grad_norm": 1.9855222520895315, "language_loss": 0.73785037, "learning_rate": 2.4222819143005168e-08, "loss": 0.7592802, "num_input_tokens_seen": 171007145, "step": 7916, "time_per_iteration": 2.5636677742004395 }, { "auxiliary_loss_clip": 0.01161615, "auxiliary_loss_mlp": 0.01022693, "balance_loss_clip": 1.04880071, "balance_loss_mlp": 1.0162015, "epoch": 0.9519629651896832, "flos": 21033634423680.0, "grad_norm": 1.646995455386403, "language_loss": 0.80710578, "learning_rate": 2.4102099983579706e-08, "loss": 0.82894886, "num_input_tokens_seen": 171026295, "step": 7917, "time_per_iteration": 2.4575631618499756 }, { "auxiliary_loss_clip": 0.01148762, "auxiliary_loss_mlp": 0.01022499, "balance_loss_clip": 1.04490614, "balance_loss_mlp": 1.01525426, "epoch": 0.9520832080803222, "flos": 21689219502720.0, "grad_norm": 1.6859170259011533, "language_loss": 0.77212524, "learning_rate": 2.3981680566294236e-08, "loss": 0.79383785, "num_input_tokens_seen": 171045895, "step": 7918, "time_per_iteration": 2.511439800262451 }, { "auxiliary_loss_clip": 0.01161373, "auxiliary_loss_mlp": 0.01024876, "balance_loss_clip": 1.04808199, "balance_loss_mlp": 1.01836765, "epoch": 0.9522034509709614, "flos": 23145289125120.0, "grad_norm": 1.8569559298984726, "language_loss": 0.73472846, "learning_rate": 2.3861560909416822e-08, "loss": 0.7565909, "num_input_tokens_seen": 171065445, "step": 7919, "time_per_iteration": 2.4702367782592773 }, { "auxiliary_loss_clip": 0.011024, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.04343975, "balance_loss_mlp": 1.02346063, "epoch": 0.9523236938616004, "flos": 24679428958080.0, "grad_norm": 1.7530768167979855, "language_loss": 0.82661229, "learning_rate": 2.3741741031169325e-08, "loss": 0.84794271, "num_input_tokens_seen": 171085015, "step": 7920, "time_per_iteration": 2.621312141418457 }, { "auxiliary_loss_clip": 0.01102955, "auxiliary_loss_mlp": 0.01024986, "balance_loss_clip": 1.04062366, "balance_loss_mlp": 1.01813126, "epoch": 0.9524439367522395, "flos": 22672309812480.0, "grad_norm": 1.7758829844082649, "language_loss": 0.71437663, "learning_rate": 2.3622220949728544e-08, "loss": 0.73565614, "num_input_tokens_seen": 171103900, "step": 7921, "time_per_iteration": 2.590489625930786 }, { "auxiliary_loss_clip": 0.01141133, "auxiliary_loss_mlp": 0.01023753, "balance_loss_clip": 1.043926, "balance_loss_mlp": 1.0164361, "epoch": 0.9525641796428787, "flos": 34055525024640.0, "grad_norm": 2.9703364674233748, "language_loss": 0.60959595, "learning_rate": 2.3503000683225526e-08, "loss": 0.63124478, "num_input_tokens_seen": 171121615, "step": 7922, "time_per_iteration": 3.3196325302124023 }, { "auxiliary_loss_clip": 0.01163396, "auxiliary_loss_mlp": 0.01028028, "balance_loss_clip": 1.0465095, "balance_loss_mlp": 1.02083373, "epoch": 0.9526844225335177, "flos": 16727083251840.0, "grad_norm": 2.1846749666102925, "language_loss": 0.84483755, "learning_rate": 2.3384080249745585e-08, "loss": 0.86675179, "num_input_tokens_seen": 171139505, "step": 7923, "time_per_iteration": 2.460360050201416 }, { "auxiliary_loss_clip": 0.01111341, "auxiliary_loss_mlp": 0.01024827, "balance_loss_clip": 1.04194558, "balance_loss_mlp": 1.01844299, "epoch": 0.9528046654241568, "flos": 36939367330560.0, "grad_norm": 2.527129576715102, "language_loss": 0.82441032, "learning_rate": 2.3265459667329178e-08, "loss": 0.84577197, "num_input_tokens_seen": 171158995, "step": 7924, "time_per_iteration": 2.7342448234558105 }, { "auxiliary_loss_clip": 0.01140465, "auxiliary_loss_mlp": 0.01024054, "balance_loss_clip": 1.04779387, "balance_loss_mlp": 1.01744342, "epoch": 0.9529249083147959, "flos": 18255010032000.0, "grad_norm": 1.9396786700388284, "language_loss": 0.85966545, "learning_rate": 2.31471389539708e-08, "loss": 0.8813107, "num_input_tokens_seen": 171176120, "step": 7925, "time_per_iteration": 2.5428967475891113 }, { "auxiliary_loss_clip": 0.01148997, "auxiliary_loss_mlp": 0.00760101, "balance_loss_clip": 1.04583526, "balance_loss_mlp": 1.00029182, "epoch": 0.953045151205435, "flos": 28658438985600.0, "grad_norm": 1.8628895356412594, "language_loss": 0.72673297, "learning_rate": 2.3029118127619872e-08, "loss": 0.74582398, "num_input_tokens_seen": 171195835, "step": 7926, "time_per_iteration": 2.575881004333496 }, { "auxiliary_loss_clip": 0.0112444, "auxiliary_loss_mlp": 0.01025374, "balance_loss_clip": 1.04117024, "balance_loss_mlp": 1.01872182, "epoch": 0.953165394096074, "flos": 21835232288640.0, "grad_norm": 2.150307555912813, "language_loss": 0.87056887, "learning_rate": 2.2911397206179628e-08, "loss": 0.89206707, "num_input_tokens_seen": 171212585, "step": 7927, "time_per_iteration": 2.5083611011505127 }, { "auxiliary_loss_clip": 0.01163636, "auxiliary_loss_mlp": 0.01026226, "balance_loss_clip": 1.04881251, "balance_loss_mlp": 1.01959467, "epoch": 0.9532856369867132, "flos": 19975059682560.0, "grad_norm": 1.7182645805307202, "language_loss": 0.62782133, "learning_rate": 2.279397620750845e-08, "loss": 0.64971995, "num_input_tokens_seen": 171231630, "step": 7928, "time_per_iteration": 2.4710488319396973 }, { "auxiliary_loss_clip": 0.01131594, "auxiliary_loss_mlp": 0.01030024, "balance_loss_clip": 1.04262137, "balance_loss_mlp": 1.02338147, "epoch": 0.9534058798773523, "flos": 15049588239360.0, "grad_norm": 1.8755278718336674, "language_loss": 0.7872082, "learning_rate": 2.2676855149419195e-08, "loss": 0.80882436, "num_input_tokens_seen": 171248800, "step": 7929, "time_per_iteration": 2.5160133838653564 }, { "auxiliary_loss_clip": 0.01130449, "auxiliary_loss_mlp": 0.0102796, "balance_loss_clip": 1.04719353, "balance_loss_mlp": 1.02031851, "epoch": 0.9535261227679913, "flos": 17602800831360.0, "grad_norm": 2.360955527824072, "language_loss": 0.75064731, "learning_rate": 2.2560034049678988e-08, "loss": 0.77223134, "num_input_tokens_seen": 171263150, "step": 7930, "time_per_iteration": 2.497328042984009 }, { "auxiliary_loss_clip": 0.01166309, "auxiliary_loss_mlp": 0.01026048, "balance_loss_clip": 1.04893017, "balance_loss_mlp": 1.01872241, "epoch": 0.9536463656586305, "flos": 23142954741120.0, "grad_norm": 1.6816326985458678, "language_loss": 0.75341535, "learning_rate": 2.2443512926008988e-08, "loss": 0.77533895, "num_input_tokens_seen": 171282480, "step": 7931, "time_per_iteration": 2.472205638885498 }, { "auxiliary_loss_clip": 0.01124612, "auxiliary_loss_mlp": 0.01026275, "balance_loss_clip": 1.04432535, "balance_loss_mlp": 1.01939392, "epoch": 0.9537666085492695, "flos": 18625033987200.0, "grad_norm": 2.32691376050846, "language_loss": 0.69540119, "learning_rate": 2.2327291796085946e-08, "loss": 0.71691006, "num_input_tokens_seen": 171300840, "step": 7932, "time_per_iteration": 2.5359432697296143 }, { "auxiliary_loss_clip": 0.01161918, "auxiliary_loss_mlp": 0.01023155, "balance_loss_clip": 1.04678321, "balance_loss_mlp": 1.015836, "epoch": 0.9538868514399086, "flos": 18989347680000.0, "grad_norm": 2.5417419410564697, "language_loss": 0.77222002, "learning_rate": 2.2211370677540197e-08, "loss": 0.79407078, "num_input_tokens_seen": 171317365, "step": 7933, "time_per_iteration": 2.4328887462615967 }, { "auxiliary_loss_clip": 0.01166106, "auxiliary_loss_mlp": 0.01032247, "balance_loss_clip": 1.04951131, "balance_loss_mlp": 1.02584839, "epoch": 0.9540070943305478, "flos": 16800556521600.0, "grad_norm": 2.5873844454302573, "language_loss": 0.78030205, "learning_rate": 2.2095749587957012e-08, "loss": 0.80228555, "num_input_tokens_seen": 171335270, "step": 7934, "time_per_iteration": 3.261431932449341 }, { "auxiliary_loss_clip": 0.01130144, "auxiliary_loss_mlp": 0.01023978, "balance_loss_clip": 1.04031932, "balance_loss_mlp": 1.01695979, "epoch": 0.9541273372211868, "flos": 20156911263360.0, "grad_norm": 1.810720897448241, "language_loss": 0.69742072, "learning_rate": 2.1980428544876138e-08, "loss": 0.71896195, "num_input_tokens_seen": 171353910, "step": 7935, "time_per_iteration": 3.3143246173858643 }, { "auxiliary_loss_clip": 0.01098271, "auxiliary_loss_mlp": 0.01021245, "balance_loss_clip": 1.03606629, "balance_loss_mlp": 1.01450026, "epoch": 0.9542475801118259, "flos": 26725511381760.0, "grad_norm": 1.6655063642925354, "language_loss": 0.73815668, "learning_rate": 2.1865407565791584e-08, "loss": 0.75935179, "num_input_tokens_seen": 171375480, "step": 7936, "time_per_iteration": 2.639575719833374 }, { "auxiliary_loss_clip": 0.01134288, "auxiliary_loss_mlp": 0.01021714, "balance_loss_clip": 1.04254246, "balance_loss_mlp": 1.0151515, "epoch": 0.954367823002465, "flos": 23330911633920.0, "grad_norm": 1.9395849067260493, "language_loss": 0.77577543, "learning_rate": 2.175068666815183e-08, "loss": 0.79733551, "num_input_tokens_seen": 171396320, "step": 7937, "time_per_iteration": 3.2276721000671387 }, { "auxiliary_loss_clip": 0.01122672, "auxiliary_loss_mlp": 0.01035291, "balance_loss_clip": 1.04391956, "balance_loss_mlp": 1.0277276, "epoch": 0.9544880658931041, "flos": 14902713527040.0, "grad_norm": 2.075506215072328, "language_loss": 0.79049915, "learning_rate": 2.163626586935985e-08, "loss": 0.81207883, "num_input_tokens_seen": 171412860, "step": 7938, "time_per_iteration": 2.5457711219787598 }, { "auxiliary_loss_clip": 0.01144343, "auxiliary_loss_mlp": 0.01028278, "balance_loss_clip": 1.04336894, "balance_loss_mlp": 1.02100885, "epoch": 0.9546083087837431, "flos": 29095902725760.0, "grad_norm": 1.9382700284821877, "language_loss": 0.62922144, "learning_rate": 2.1522145186773755e-08, "loss": 0.65094763, "num_input_tokens_seen": 171431780, "step": 7939, "time_per_iteration": 2.539872646331787 }, { "auxiliary_loss_clip": 0.01134788, "auxiliary_loss_mlp": 0.0102525, "balance_loss_clip": 1.04499173, "balance_loss_mlp": 1.01827621, "epoch": 0.9547285516743822, "flos": 21142335957120.0, "grad_norm": 1.9304987341021955, "language_loss": 0.85650831, "learning_rate": 2.140832463770481e-08, "loss": 0.87810868, "num_input_tokens_seen": 171450975, "step": 7940, "time_per_iteration": 2.5629146099090576 }, { "auxiliary_loss_clip": 0.011364, "auxiliary_loss_mlp": 0.01026522, "balance_loss_clip": 1.04242373, "balance_loss_mlp": 1.01913393, "epoch": 0.9548487945650214, "flos": 27490157130240.0, "grad_norm": 2.2815317655781615, "language_loss": 0.76083231, "learning_rate": 2.129480423941987e-08, "loss": 0.78246152, "num_input_tokens_seen": 171467645, "step": 7941, "time_per_iteration": 2.5610032081604004 }, { "auxiliary_loss_clip": 0.01134823, "auxiliary_loss_mlp": 0.01022295, "balance_loss_clip": 1.04528093, "balance_loss_mlp": 1.01579475, "epoch": 0.9549690374556604, "flos": 22273198819200.0, "grad_norm": 1.6283804033895795, "language_loss": 0.80229771, "learning_rate": 2.1181584009140052e-08, "loss": 0.82386887, "num_input_tokens_seen": 171487185, "step": 7942, "time_per_iteration": 2.530864953994751 }, { "auxiliary_loss_clip": 0.01127447, "auxiliary_loss_mlp": 0.01021017, "balance_loss_clip": 1.0432303, "balance_loss_mlp": 1.01471364, "epoch": 0.9550892803462995, "flos": 17595294888960.0, "grad_norm": 2.203500013182331, "language_loss": 0.83454543, "learning_rate": 2.10686639640405e-08, "loss": 0.85603005, "num_input_tokens_seen": 171501275, "step": 7943, "time_per_iteration": 2.5469717979431152 }, { "auxiliary_loss_clip": 0.01145655, "auxiliary_loss_mlp": 0.01024409, "balance_loss_clip": 1.04357362, "balance_loss_mlp": 1.01711917, "epoch": 0.9552095232369386, "flos": 24353144789760.0, "grad_norm": 1.5649408122038833, "language_loss": 0.81138825, "learning_rate": 2.0956044121251294e-08, "loss": 0.83308887, "num_input_tokens_seen": 171520060, "step": 7944, "time_per_iteration": 2.5175137519836426 }, { "auxiliary_loss_clip": 0.01119496, "auxiliary_loss_mlp": 0.01023771, "balance_loss_clip": 1.04524159, "balance_loss_mlp": 1.01602817, "epoch": 0.9553297661275777, "flos": 22746860490240.0, "grad_norm": 1.7815938619649618, "language_loss": 0.81238055, "learning_rate": 2.084372449785654e-08, "loss": 0.83381325, "num_input_tokens_seen": 171539895, "step": 7945, "time_per_iteration": 2.5679187774658203 }, { "auxiliary_loss_clip": 0.01129985, "auxiliary_loss_mlp": 0.01024386, "balance_loss_clip": 1.04196763, "balance_loss_mlp": 1.01792216, "epoch": 0.9554500090182168, "flos": 15413866018560.0, "grad_norm": 1.6902451338785822, "language_loss": 0.68593669, "learning_rate": 2.0731705110895282e-08, "loss": 0.70748043, "num_input_tokens_seen": 171557385, "step": 7946, "time_per_iteration": 2.51924991607666 }, { "auxiliary_loss_clip": 0.01151677, "auxiliary_loss_mlp": 0.01029235, "balance_loss_clip": 1.04832244, "balance_loss_mlp": 1.02245831, "epoch": 0.9555702519088559, "flos": 23513517400320.0, "grad_norm": 2.1378180441024575, "language_loss": 0.86633778, "learning_rate": 2.0619985977360587e-08, "loss": 0.88814688, "num_input_tokens_seen": 171575705, "step": 7947, "time_per_iteration": 2.516890048980713 }, { "auxiliary_loss_clip": 0.01107319, "auxiliary_loss_mlp": 0.01026027, "balance_loss_clip": 1.03996718, "balance_loss_mlp": 1.01985538, "epoch": 0.955690494799495, "flos": 22962072827520.0, "grad_norm": 1.806933176924031, "language_loss": 0.7659893, "learning_rate": 2.0508567114200237e-08, "loss": 0.78732276, "num_input_tokens_seen": 171595620, "step": 7948, "time_per_iteration": 3.4240827560424805 }, { "auxiliary_loss_clip": 0.01138483, "auxiliary_loss_mlp": 0.01023638, "balance_loss_clip": 1.04499078, "balance_loss_mlp": 1.01735592, "epoch": 0.955810737690134, "flos": 26031250333440.0, "grad_norm": 1.9613280032332647, "language_loss": 0.78864652, "learning_rate": 2.0397448538316485e-08, "loss": 0.81026775, "num_input_tokens_seen": 171616660, "step": 7949, "time_per_iteration": 2.5868725776672363 }, { "auxiliary_loss_clip": 0.01114372, "auxiliary_loss_mlp": 0.01027177, "balance_loss_clip": 1.04110646, "balance_loss_mlp": 1.02106452, "epoch": 0.9559309805807732, "flos": 20849951249280.0, "grad_norm": 1.874655764486858, "language_loss": 0.66738731, "learning_rate": 2.028663026656563e-08, "loss": 0.68880272, "num_input_tokens_seen": 171635515, "step": 7950, "time_per_iteration": 2.564220428466797 }, { "auxiliary_loss_clip": 0.01161102, "auxiliary_loss_mlp": 0.00760581, "balance_loss_clip": 1.04664993, "balance_loss_mlp": 1.00033307, "epoch": 0.9560512234714122, "flos": 21578219498880.0, "grad_norm": 1.9809210709014204, "language_loss": 0.71698362, "learning_rate": 2.0176112315758885e-08, "loss": 0.73620045, "num_input_tokens_seen": 171653305, "step": 7951, "time_per_iteration": 2.451366662979126 }, { "auxiliary_loss_clip": 0.01114575, "auxiliary_loss_mlp": 0.0103035, "balance_loss_clip": 1.04311395, "balance_loss_mlp": 1.02289367, "epoch": 0.9561714663620513, "flos": 17450144029440.0, "grad_norm": 2.726791228028147, "language_loss": 0.69659364, "learning_rate": 2.0065894702661957e-08, "loss": 0.71804285, "num_input_tokens_seen": 171669980, "step": 7952, "time_per_iteration": 2.5818333625793457 }, { "auxiliary_loss_clip": 0.0111176, "auxiliary_loss_mlp": 0.00761138, "balance_loss_clip": 1.0395751, "balance_loss_mlp": 1.00029814, "epoch": 0.9562917092526905, "flos": 26098510550400.0, "grad_norm": 1.9120014640203198, "language_loss": 0.77680653, "learning_rate": 1.9955977443994577e-08, "loss": 0.7955355, "num_input_tokens_seen": 171689970, "step": 7953, "time_per_iteration": 2.5979108810424805 }, { "auxiliary_loss_clip": 0.01132851, "auxiliary_loss_mlp": 0.01025973, "balance_loss_clip": 1.04314661, "balance_loss_mlp": 1.0179739, "epoch": 0.9564119521433295, "flos": 24096742531200.0, "grad_norm": 2.60970077115704, "language_loss": 0.62206787, "learning_rate": 1.9846360556430965e-08, "loss": 0.64365613, "num_input_tokens_seen": 171708270, "step": 7954, "time_per_iteration": 2.5528271198272705 }, { "auxiliary_loss_clip": 0.01161423, "auxiliary_loss_mlp": 0.01023838, "balance_loss_clip": 1.04621172, "balance_loss_mlp": 1.01683736, "epoch": 0.9565321950339686, "flos": 32008903896960.0, "grad_norm": 2.1395829503083306, "language_loss": 0.61498952, "learning_rate": 1.973704405660004e-08, "loss": 0.63684213, "num_input_tokens_seen": 171729385, "step": 7955, "time_per_iteration": 2.5460877418518066 }, { "auxiliary_loss_clip": 0.01090174, "auxiliary_loss_mlp": 0.01024362, "balance_loss_clip": 1.03893149, "balance_loss_mlp": 1.01797509, "epoch": 0.9566524379246077, "flos": 23588642695680.0, "grad_norm": 1.4375879785027537, "language_loss": 0.77867413, "learning_rate": 1.9628027961085203e-08, "loss": 0.79981953, "num_input_tokens_seen": 171752615, "step": 7956, "time_per_iteration": 2.6747639179229736 }, { "auxiliary_loss_clip": 0.01109253, "auxiliary_loss_mlp": 0.0102387, "balance_loss_clip": 1.038486, "balance_loss_mlp": 1.01716483, "epoch": 0.9567726808152468, "flos": 38067716240640.0, "grad_norm": 1.8186485631267544, "language_loss": 0.84116441, "learning_rate": 1.9519312286423894e-08, "loss": 0.86249566, "num_input_tokens_seen": 171775810, "step": 7957, "time_per_iteration": 2.7127623558044434 }, { "auxiliary_loss_clip": 0.01147158, "auxiliary_loss_mlp": 0.01028298, "balance_loss_clip": 1.04654002, "balance_loss_mlp": 1.02106833, "epoch": 0.9568929237058859, "flos": 22744059229440.0, "grad_norm": 1.4927151193798343, "language_loss": 0.77671731, "learning_rate": 1.9410897049108255e-08, "loss": 0.79847193, "num_input_tokens_seen": 171795090, "step": 7958, "time_per_iteration": 2.509469747543335 }, { "auxiliary_loss_clip": 0.01169109, "auxiliary_loss_mlp": 0.01030606, "balance_loss_clip": 1.05068994, "balance_loss_mlp": 1.02333117, "epoch": 0.957013166596525, "flos": 23841633162240.0, "grad_norm": 1.7084350605418128, "language_loss": 0.91182852, "learning_rate": 1.9302782265584905e-08, "loss": 0.93382573, "num_input_tokens_seen": 171815755, "step": 7959, "time_per_iteration": 2.494845151901245 }, { "auxiliary_loss_clip": 0.01094004, "auxiliary_loss_mlp": 0.01027163, "balance_loss_clip": 1.04119349, "balance_loss_mlp": 1.02073741, "epoch": 0.9571334094871641, "flos": 17639286071040.0, "grad_norm": 2.1382424909288744, "language_loss": 0.86973548, "learning_rate": 1.9194967952254282e-08, "loss": 0.8909471, "num_input_tokens_seen": 171834330, "step": 7960, "time_per_iteration": 3.380476713180542 }, { "auxiliary_loss_clip": 0.01150887, "auxiliary_loss_mlp": 0.01026989, "balance_loss_clip": 1.04972529, "balance_loss_mlp": 1.02016461, "epoch": 0.9572536523778031, "flos": 15369623441280.0, "grad_norm": 2.4840838323193397, "language_loss": 0.80781662, "learning_rate": 1.9087454125472635e-08, "loss": 0.82959533, "num_input_tokens_seen": 171848805, "step": 7961, "time_per_iteration": 3.310548782348633 }, { "auxiliary_loss_clip": 0.01164894, "auxiliary_loss_mlp": 0.01025772, "balance_loss_clip": 1.04920328, "balance_loss_mlp": 1.01817274, "epoch": 0.9573738952684423, "flos": 24969838417920.0, "grad_norm": 1.8096127494770102, "language_loss": 0.7824899, "learning_rate": 1.8980240801548696e-08, "loss": 0.80439651, "num_input_tokens_seen": 171867995, "step": 7962, "time_per_iteration": 2.479408025741577 }, { "auxiliary_loss_clip": 0.01135357, "auxiliary_loss_mlp": 0.01026083, "balance_loss_clip": 1.04787564, "balance_loss_mlp": 1.01969695, "epoch": 0.9574941381590814, "flos": 25769461034880.0, "grad_norm": 1.6374709454689123, "language_loss": 0.74479234, "learning_rate": 1.8873327996747458e-08, "loss": 0.76640677, "num_input_tokens_seen": 171886495, "step": 7963, "time_per_iteration": 3.2566113471984863 }, { "auxiliary_loss_clip": 0.0115175, "auxiliary_loss_mlp": 0.01023247, "balance_loss_clip": 1.04513717, "balance_loss_mlp": 1.01659214, "epoch": 0.9576143810497204, "flos": 32307178435200.0, "grad_norm": 2.5505086838690447, "language_loss": 0.65816391, "learning_rate": 1.8766715727287053e-08, "loss": 0.67991382, "num_input_tokens_seen": 171908200, "step": 7964, "time_per_iteration": 2.5714282989501953 }, { "auxiliary_loss_clip": 0.01152868, "auxiliary_loss_mlp": 0.00761068, "balance_loss_clip": 1.04611385, "balance_loss_mlp": 1.00034475, "epoch": 0.9577346239403596, "flos": 27745733376000.0, "grad_norm": 1.688603296093393, "language_loss": 0.79251325, "learning_rate": 1.8660404009340546e-08, "loss": 0.81165266, "num_input_tokens_seen": 171928650, "step": 7965, "time_per_iteration": 2.526607036590576 }, { "auxiliary_loss_clip": 0.01045076, "auxiliary_loss_mlp": 0.01001168, "balance_loss_clip": 1.0073595, "balance_loss_mlp": 1.00007689, "epoch": 0.9578548668309986, "flos": 57468313710720.0, "grad_norm": 0.8684609969920518, "language_loss": 0.59510231, "learning_rate": 1.8554392859035485e-08, "loss": 0.61556476, "num_input_tokens_seen": 171986400, "step": 7966, "time_per_iteration": 3.0949604511260986 }, { "auxiliary_loss_clip": 0.0108195, "auxiliary_loss_mlp": 0.01024565, "balance_loss_clip": 1.03858936, "balance_loss_mlp": 1.01751423, "epoch": 0.9579751097216377, "flos": 19756040503680.0, "grad_norm": 1.6473451490826385, "language_loss": 0.78981209, "learning_rate": 1.8448682292453444e-08, "loss": 0.8108772, "num_input_tokens_seen": 172005475, "step": 7967, "time_per_iteration": 2.6576881408691406 }, { "auxiliary_loss_clip": 0.01163706, "auxiliary_loss_mlp": 0.01026308, "balance_loss_clip": 1.04761469, "balance_loss_mlp": 1.01947129, "epoch": 0.9580953526122769, "flos": 18041270152320.0, "grad_norm": 1.6350014146219816, "language_loss": 0.65824908, "learning_rate": 1.8343272325631154e-08, "loss": 0.6801492, "num_input_tokens_seen": 172024420, "step": 7968, "time_per_iteration": 2.4339351654052734 }, { "auxiliary_loss_clip": 0.01084061, "auxiliary_loss_mlp": 0.00760742, "balance_loss_clip": 1.03899169, "balance_loss_mlp": 1.00028443, "epoch": 0.9582155955029159, "flos": 24270154416000.0, "grad_norm": 2.246627474268514, "language_loss": 0.77989918, "learning_rate": 1.8238162974558492e-08, "loss": 0.79834723, "num_input_tokens_seen": 172038350, "step": 7969, "time_per_iteration": 2.6724510192871094 }, { "auxiliary_loss_clip": 0.01134468, "auxiliary_loss_mlp": 0.010283, "balance_loss_clip": 1.04542005, "balance_loss_mlp": 1.02136171, "epoch": 0.958335838393555, "flos": 22783309816320.0, "grad_norm": 1.875044826283786, "language_loss": 0.74728358, "learning_rate": 1.8133354255181144e-08, "loss": 0.76891124, "num_input_tokens_seen": 172058665, "step": 7970, "time_per_iteration": 2.5501389503479004 }, { "auxiliary_loss_clip": 0.01143606, "auxiliary_loss_mlp": 0.01022564, "balance_loss_clip": 1.04420853, "balance_loss_mlp": 1.01590061, "epoch": 0.958456081284194, "flos": 16911484698240.0, "grad_norm": 1.6525956815057354, "language_loss": 0.74900681, "learning_rate": 1.802884618339795e-08, "loss": 0.77066845, "num_input_tokens_seen": 172077470, "step": 7971, "time_per_iteration": 2.5046470165252686 }, { "auxiliary_loss_clip": 0.01153006, "auxiliary_loss_mlp": 0.01029262, "balance_loss_clip": 1.04756451, "balance_loss_mlp": 1.02180207, "epoch": 0.9585763241748332, "flos": 19974951941760.0, "grad_norm": 1.9007492823122745, "language_loss": 0.80918318, "learning_rate": 1.7924638775062894e-08, "loss": 0.83100581, "num_input_tokens_seen": 172096590, "step": 7972, "time_per_iteration": 2.4831409454345703 }, { "auxiliary_loss_clip": 0.01118412, "auxiliary_loss_mlp": 0.01024822, "balance_loss_clip": 1.04557991, "balance_loss_mlp": 1.01789021, "epoch": 0.9586965670654722, "flos": 21395649646080.0, "grad_norm": 1.9689571904760679, "language_loss": 0.81484783, "learning_rate": 1.7820732045984444e-08, "loss": 0.83628023, "num_input_tokens_seen": 172116735, "step": 7973, "time_per_iteration": 2.592085361480713 }, { "auxiliary_loss_clip": 0.01144725, "auxiliary_loss_mlp": 0.01030861, "balance_loss_clip": 1.04435754, "balance_loss_mlp": 1.02311254, "epoch": 0.9588168099561113, "flos": 21435115714560.0, "grad_norm": 1.7689515913873297, "language_loss": 0.73986429, "learning_rate": 1.7717126011924655e-08, "loss": 0.76162016, "num_input_tokens_seen": 172138320, "step": 7974, "time_per_iteration": 3.2654218673706055 }, { "auxiliary_loss_clip": 0.01099422, "auxiliary_loss_mlp": 0.01029322, "balance_loss_clip": 1.03684878, "balance_loss_mlp": 1.0220356, "epoch": 0.9589370528467505, "flos": 11763761852160.0, "grad_norm": 2.5049492259722537, "language_loss": 0.7601344, "learning_rate": 1.7613820688600957e-08, "loss": 0.78142184, "num_input_tokens_seen": 172154225, "step": 7975, "time_per_iteration": 2.604381561279297 }, { "auxiliary_loss_clip": 0.01137727, "auxiliary_loss_mlp": 0.01021631, "balance_loss_clip": 1.04508519, "balance_loss_mlp": 1.01509833, "epoch": 0.9590572957373895, "flos": 23441516588160.0, "grad_norm": 1.8710593928655854, "language_loss": 0.78712595, "learning_rate": 1.7510816091684588e-08, "loss": 0.80871952, "num_input_tokens_seen": 172174150, "step": 7976, "time_per_iteration": 2.5552797317504883 }, { "auxiliary_loss_clip": 0.01136931, "auxiliary_loss_mlp": 0.01028834, "balance_loss_clip": 1.04613435, "balance_loss_mlp": 1.02098393, "epoch": 0.9591775386280286, "flos": 22528272274560.0, "grad_norm": 3.3617539517850727, "language_loss": 0.78402609, "learning_rate": 1.740811223680083e-08, "loss": 0.80568373, "num_input_tokens_seen": 172191005, "step": 7977, "time_per_iteration": 2.539546012878418 }, { "auxiliary_loss_clip": 0.01163127, "auxiliary_loss_mlp": 0.01026906, "balance_loss_clip": 1.04817355, "balance_loss_mlp": 1.01989102, "epoch": 0.9592977815186677, "flos": 18186959715840.0, "grad_norm": 3.902698516837599, "language_loss": 0.74071181, "learning_rate": 1.7305709139530334e-08, "loss": 0.7626121, "num_input_tokens_seen": 172209785, "step": 7978, "time_per_iteration": 2.4497592449188232 }, { "auxiliary_loss_clip": 0.01141815, "auxiliary_loss_mlp": 0.01021717, "balance_loss_clip": 1.04331708, "balance_loss_mlp": 1.01474333, "epoch": 0.9594180244093068, "flos": 16537797555840.0, "grad_norm": 2.2719415736336988, "language_loss": 0.74410778, "learning_rate": 1.7203606815407334e-08, "loss": 0.76574308, "num_input_tokens_seen": 172224380, "step": 7979, "time_per_iteration": 2.45404314994812 }, { "auxiliary_loss_clip": 0.01139608, "auxiliary_loss_mlp": 0.01028009, "balance_loss_clip": 1.04702401, "balance_loss_mlp": 1.02060664, "epoch": 0.9595382672999458, "flos": 20554334317440.0, "grad_norm": 1.674285512132708, "language_loss": 0.79451597, "learning_rate": 1.7101805279920557e-08, "loss": 0.81619209, "num_input_tokens_seen": 172242540, "step": 7980, "time_per_iteration": 2.523052215576172 }, { "auxiliary_loss_clip": 0.01163699, "auxiliary_loss_mlp": 0.01027359, "balance_loss_clip": 1.04839087, "balance_loss_mlp": 1.02066231, "epoch": 0.959658510190585, "flos": 22638266697600.0, "grad_norm": 1.9170229606506757, "language_loss": 0.80864215, "learning_rate": 1.7000304548513643e-08, "loss": 0.8305527, "num_input_tokens_seen": 172262645, "step": 7981, "time_per_iteration": 2.4750542640686035 }, { "auxiliary_loss_clip": 0.01117557, "auxiliary_loss_mlp": 0.01029802, "balance_loss_clip": 1.04205489, "balance_loss_mlp": 1.02269757, "epoch": 0.9597787530812241, "flos": 19135252725120.0, "grad_norm": 1.9069935938248344, "language_loss": 0.82814127, "learning_rate": 1.6899104636583394e-08, "loss": 0.84961486, "num_input_tokens_seen": 172280695, "step": 7982, "time_per_iteration": 2.5450358390808105 }, { "auxiliary_loss_clip": 0.01045617, "auxiliary_loss_mlp": 0.01002766, "balance_loss_clip": 1.00732327, "balance_loss_mlp": 1.00166321, "epoch": 0.9598989959718631, "flos": 60098124055680.0, "grad_norm": 0.7226842863766808, "language_loss": 0.61933124, "learning_rate": 1.6798205559482638e-08, "loss": 0.63981509, "num_input_tokens_seen": 172343075, "step": 7983, "time_per_iteration": 3.2594106197357178 }, { "auxiliary_loss_clip": 0.01120318, "auxiliary_loss_mlp": 0.01022001, "balance_loss_clip": 1.04369044, "balance_loss_mlp": 1.01565909, "epoch": 0.9600192388625023, "flos": 20886795624960.0, "grad_norm": 1.718562349535918, "language_loss": 0.76443392, "learning_rate": 1.669760733251713e-08, "loss": 0.78585708, "num_input_tokens_seen": 172361950, "step": 7984, "time_per_iteration": 2.5551419258117676 }, { "auxiliary_loss_clip": 0.01098306, "auxiliary_loss_mlp": 0.010256, "balance_loss_clip": 1.04177082, "balance_loss_mlp": 1.0195713, "epoch": 0.9601394817531413, "flos": 20445740524800.0, "grad_norm": 1.936057030903933, "language_loss": 0.82363009, "learning_rate": 1.659730997094755e-08, "loss": 0.8448692, "num_input_tokens_seen": 172380440, "step": 7985, "time_per_iteration": 2.6322832107543945 }, { "auxiliary_loss_clip": 0.011418, "auxiliary_loss_mlp": 0.01025123, "balance_loss_clip": 1.04342496, "balance_loss_mlp": 1.01866817, "epoch": 0.9602597246437804, "flos": 21507152440320.0, "grad_norm": 1.8587172726778574, "language_loss": 0.61934876, "learning_rate": 1.6497313489989283e-08, "loss": 0.64101803, "num_input_tokens_seen": 172400265, "step": 7986, "time_per_iteration": 3.2295594215393066 }, { "auxiliary_loss_clip": 0.0110375, "auxiliary_loss_mlp": 0.01023529, "balance_loss_clip": 1.03555393, "balance_loss_mlp": 1.01674032, "epoch": 0.9603799675344196, "flos": 29935099152000.0, "grad_norm": 2.992291760722505, "language_loss": 0.70147073, "learning_rate": 1.639761790481131e-08, "loss": 0.72274351, "num_input_tokens_seen": 172421145, "step": 7987, "time_per_iteration": 3.438467025756836 }, { "auxiliary_loss_clip": 0.01148115, "auxiliary_loss_mlp": 0.01027148, "balance_loss_clip": 1.04554462, "balance_loss_mlp": 1.02030492, "epoch": 0.9605002104250586, "flos": 28001525103360.0, "grad_norm": 1.9602602835153848, "language_loss": 0.79494596, "learning_rate": 1.6298223230537754e-08, "loss": 0.81669861, "num_input_tokens_seen": 172438945, "step": 7988, "time_per_iteration": 2.5490968227386475 }, { "auxiliary_loss_clip": 0.01134268, "auxiliary_loss_mlp": 0.00761482, "balance_loss_clip": 1.04463458, "balance_loss_mlp": 1.0003047, "epoch": 0.9606204533156977, "flos": 35590490870400.0, "grad_norm": 1.8592926522215996, "language_loss": 0.69862187, "learning_rate": 1.619912948224611e-08, "loss": 0.71757936, "num_input_tokens_seen": 172460150, "step": 7989, "time_per_iteration": 3.392298460006714 }, { "auxiliary_loss_clip": 0.01116401, "auxiliary_loss_mlp": 0.01027179, "balance_loss_clip": 1.0429635, "balance_loss_mlp": 1.02043796, "epoch": 0.9607406962063368, "flos": 26574614346240.0, "grad_norm": 2.104105829381176, "language_loss": 0.61350882, "learning_rate": 1.6100336674969682e-08, "loss": 0.63494462, "num_input_tokens_seen": 172478990, "step": 7990, "time_per_iteration": 2.598083019256592 }, { "auxiliary_loss_clip": 0.01105148, "auxiliary_loss_mlp": 0.01029362, "balance_loss_clip": 1.03935313, "balance_loss_mlp": 1.02231371, "epoch": 0.9608609390969759, "flos": 25331781813120.0, "grad_norm": 1.6998164256453825, "language_loss": 0.76649106, "learning_rate": 1.600184482369449e-08, "loss": 0.78783613, "num_input_tokens_seen": 172498905, "step": 7991, "time_per_iteration": 2.6181211471557617 }, { "auxiliary_loss_clip": 0.01122706, "auxiliary_loss_mlp": 0.01026539, "balance_loss_clip": 1.04163432, "balance_loss_mlp": 1.01827741, "epoch": 0.960981181987615, "flos": 21069114082560.0, "grad_norm": 2.1901811447142285, "language_loss": 0.89137876, "learning_rate": 1.5903653943362126e-08, "loss": 0.91287124, "num_input_tokens_seen": 172517900, "step": 7992, "time_per_iteration": 2.555037498474121 }, { "auxiliary_loss_clip": 0.01137653, "auxiliary_loss_mlp": 0.01031259, "balance_loss_clip": 1.04616225, "balance_loss_mlp": 1.02453279, "epoch": 0.9611014248782541, "flos": 17823256554240.0, "grad_norm": 1.836182927726961, "language_loss": 0.77256715, "learning_rate": 1.580576404886802e-08, "loss": 0.79425633, "num_input_tokens_seen": 172536430, "step": 7993, "time_per_iteration": 2.538790225982666 }, { "auxiliary_loss_clip": 0.01148479, "auxiliary_loss_mlp": 0.01023767, "balance_loss_clip": 1.04520273, "balance_loss_mlp": 1.01762521, "epoch": 0.9612216677688932, "flos": 19354631040000.0, "grad_norm": 1.8825170543333891, "language_loss": 0.79666519, "learning_rate": 1.570817515506162e-08, "loss": 0.81838763, "num_input_tokens_seen": 172555120, "step": 7994, "time_per_iteration": 2.499115467071533 }, { "auxiliary_loss_clip": 0.01161538, "auxiliary_loss_mlp": 0.01027221, "balance_loss_clip": 1.04777336, "balance_loss_mlp": 1.02045608, "epoch": 0.9613419106595322, "flos": 15808739207040.0, "grad_norm": 2.057360885495906, "language_loss": 0.81423044, "learning_rate": 1.561088727674753e-08, "loss": 0.83611798, "num_input_tokens_seen": 172569330, "step": 7995, "time_per_iteration": 2.42051100730896 }, { "auxiliary_loss_clip": 0.01119659, "auxiliary_loss_mlp": 0.01030684, "balance_loss_clip": 1.04361892, "balance_loss_mlp": 1.02332008, "epoch": 0.9614621535501714, "flos": 25702488126720.0, "grad_norm": 2.2726108391107225, "language_loss": 0.71202362, "learning_rate": 1.551390042868417e-08, "loss": 0.73352695, "num_input_tokens_seen": 172591100, "step": 7996, "time_per_iteration": 2.6651251316070557 }, { "auxiliary_loss_clip": 0.01149182, "auxiliary_loss_mlp": 0.01029836, "balance_loss_clip": 1.04587507, "balance_loss_mlp": 1.02305591, "epoch": 0.9615823964408104, "flos": 17819054663040.0, "grad_norm": 1.6851700242856786, "language_loss": 0.70589644, "learning_rate": 1.5417214625584207e-08, "loss": 0.72768658, "num_input_tokens_seen": 172608755, "step": 7997, "time_per_iteration": 2.4734814167022705 }, { "auxiliary_loss_clip": 0.01143264, "auxiliary_loss_mlp": 0.01022389, "balance_loss_clip": 1.0436101, "balance_loss_mlp": 1.01531148, "epoch": 0.9617026393314495, "flos": 20190020624640.0, "grad_norm": 1.7566921800583892, "language_loss": 0.85435522, "learning_rate": 1.5320829882114806e-08, "loss": 0.87601173, "num_input_tokens_seen": 172626830, "step": 7998, "time_per_iteration": 2.4975128173828125 }, { "auxiliary_loss_clip": 0.01161409, "auxiliary_loss_mlp": 0.01026055, "balance_loss_clip": 1.04583633, "balance_loss_mlp": 1.01881349, "epoch": 0.9618228822220887, "flos": 20267013427200.0, "grad_norm": 1.849774283957806, "language_loss": 0.78620625, "learning_rate": 1.5224746212897378e-08, "loss": 0.80808085, "num_input_tokens_seen": 172646125, "step": 7999, "time_per_iteration": 2.464493989944458 }, { "auxiliary_loss_clip": 0.01158189, "auxiliary_loss_mlp": 0.01026117, "balance_loss_clip": 1.04587173, "balance_loss_mlp": 1.01898265, "epoch": 0.9619431251127277, "flos": 21031300039680.0, "grad_norm": 1.6863043696975948, "language_loss": 0.77466083, "learning_rate": 1.512896363250804e-08, "loss": 0.7965039, "num_input_tokens_seen": 172666235, "step": 8000, "time_per_iteration": 2.4748361110687256 }, { "auxiliary_loss_clip": 0.011511, "auxiliary_loss_mlp": 0.01026543, "balance_loss_clip": 1.0450511, "balance_loss_mlp": 1.01952446, "epoch": 0.9620633680033668, "flos": 22382654538240.0, "grad_norm": 2.0285535344219956, "language_loss": 0.75599778, "learning_rate": 1.503348215547673e-08, "loss": 0.77777421, "num_input_tokens_seen": 172687325, "step": 8001, "time_per_iteration": 3.2231593132019043 }, { "auxiliary_loss_clip": 0.01132075, "auxiliary_loss_mlp": 0.01024739, "balance_loss_clip": 1.04357624, "balance_loss_mlp": 1.01775897, "epoch": 0.962183610894006, "flos": 18471730740480.0, "grad_norm": 1.7545911779760777, "language_loss": 0.80729938, "learning_rate": 1.4938301796288078e-08, "loss": 0.82886755, "num_input_tokens_seen": 172703895, "step": 8002, "time_per_iteration": 2.502425193786621 }, { "auxiliary_loss_clip": 0.01163318, "auxiliary_loss_mlp": 0.01032466, "balance_loss_clip": 1.04760408, "balance_loss_mlp": 1.02527165, "epoch": 0.962303853784645, "flos": 18435245500800.0, "grad_norm": 2.137984722156509, "language_loss": 0.81459463, "learning_rate": 1.4843422569380537e-08, "loss": 0.8365525, "num_input_tokens_seen": 172720650, "step": 8003, "time_per_iteration": 2.41196870803833 }, { "auxiliary_loss_clip": 0.01101641, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 1.03995061, "balance_loss_mlp": 1.01965094, "epoch": 0.9624240966752841, "flos": 26391074826240.0, "grad_norm": 1.6015325716235473, "language_loss": 0.83028871, "learning_rate": 1.4748844489147483e-08, "loss": 0.85157144, "num_input_tokens_seen": 172737640, "step": 8004, "time_per_iteration": 2.5983951091766357 }, { "auxiliary_loss_clip": 0.01134274, "auxiliary_loss_mlp": 0.01024603, "balance_loss_clip": 1.04218674, "balance_loss_mlp": 1.0183773, "epoch": 0.9625443395659231, "flos": 14647675985280.0, "grad_norm": 1.877892659116549, "language_loss": 0.70602912, "learning_rate": 1.4654567569936326e-08, "loss": 0.72761786, "num_input_tokens_seen": 172755215, "step": 8005, "time_per_iteration": 2.4951412677764893 }, { "auxiliary_loss_clip": 0.01103083, "auxiliary_loss_mlp": 0.01026352, "balance_loss_clip": 1.04124856, "balance_loss_mlp": 1.01883626, "epoch": 0.9626645824565623, "flos": 18367626147840.0, "grad_norm": 2.005413088382172, "language_loss": 0.83309728, "learning_rate": 1.456059182604874e-08, "loss": 0.85439163, "num_input_tokens_seen": 172774020, "step": 8006, "time_per_iteration": 2.5613787174224854 }, { "auxiliary_loss_clip": 0.01164235, "auxiliary_loss_mlp": 0.01027482, "balance_loss_clip": 1.0485431, "balance_loss_mlp": 1.02023065, "epoch": 0.9627848253472013, "flos": 16580424021120.0, "grad_norm": 1.7753137676770518, "language_loss": 0.76191568, "learning_rate": 1.4466917271740653e-08, "loss": 0.78383291, "num_input_tokens_seen": 172792220, "step": 8007, "time_per_iteration": 2.4156441688537598 }, { "auxiliary_loss_clip": 0.01130965, "auxiliary_loss_mlp": 0.01029711, "balance_loss_clip": 1.04409993, "balance_loss_mlp": 1.02215874, "epoch": 0.9629050682378404, "flos": 20886867452160.0, "grad_norm": 1.9766583494069903, "language_loss": 0.67520797, "learning_rate": 1.4373543921222697e-08, "loss": 0.69681472, "num_input_tokens_seen": 172811805, "step": 8008, "time_per_iteration": 2.5161733627319336 }, { "auxiliary_loss_clip": 0.01131438, "auxiliary_loss_mlp": 0.0101967, "balance_loss_clip": 1.04498756, "balance_loss_mlp": 1.01262498, "epoch": 0.9630253111284796, "flos": 17019252478080.0, "grad_norm": 1.9634002622344724, "language_loss": 0.77854943, "learning_rate": 1.428047178865932e-08, "loss": 0.80006051, "num_input_tokens_seen": 172828595, "step": 8009, "time_per_iteration": 2.486952781677246 }, { "auxiliary_loss_clip": 0.01130944, "auxiliary_loss_mlp": 0.01023636, "balance_loss_clip": 1.04096889, "balance_loss_mlp": 1.01648903, "epoch": 0.9631455540191186, "flos": 20338942412160.0, "grad_norm": 1.5905516074286141, "language_loss": 0.74504638, "learning_rate": 1.4187700888169451e-08, "loss": 0.76659214, "num_input_tokens_seen": 172847770, "step": 8010, "time_per_iteration": 2.5184552669525146 }, { "auxiliary_loss_clip": 0.01044979, "auxiliary_loss_mlp": 0.01001636, "balance_loss_clip": 1.00886941, "balance_loss_mlp": 1.00052691, "epoch": 0.9632657969097577, "flos": 65956700033280.0, "grad_norm": 0.7794178827847943, "language_loss": 0.57004815, "learning_rate": 1.40952312338265e-08, "loss": 0.5905143, "num_input_tokens_seen": 172912415, "step": 8011, "time_per_iteration": 3.1478281021118164 }, { "auxiliary_loss_clip": 0.01122401, "auxiliary_loss_mlp": 0.01025662, "balance_loss_clip": 1.04165483, "balance_loss_mlp": 1.0190165, "epoch": 0.9633860398003968, "flos": 44419523823360.0, "grad_norm": 1.724001864901442, "language_loss": 0.6837216, "learning_rate": 1.4003062839657909e-08, "loss": 0.70520228, "num_input_tokens_seen": 172934895, "step": 8012, "time_per_iteration": 3.5965487957000732 }, { "auxiliary_loss_clip": 0.01120899, "auxiliary_loss_mlp": 0.01024161, "balance_loss_clip": 1.04278278, "balance_loss_mlp": 1.01685667, "epoch": 0.9635062826910359, "flos": 24827704300800.0, "grad_norm": 1.6030984427089152, "language_loss": 0.80081511, "learning_rate": 1.391119571964583e-08, "loss": 0.82226562, "num_input_tokens_seen": 172955835, "step": 8013, "time_per_iteration": 3.373582363128662 }, { "auxiliary_loss_clip": 0.01145904, "auxiliary_loss_mlp": 0.0102656, "balance_loss_clip": 1.04472685, "balance_loss_mlp": 1.02010202, "epoch": 0.9636265255816749, "flos": 15961360095360.0, "grad_norm": 1.8235726575009459, "language_loss": 0.72715318, "learning_rate": 1.3819629887726225e-08, "loss": 0.74887782, "num_input_tokens_seen": 172973925, "step": 8014, "time_per_iteration": 2.4722163677215576 }, { "auxiliary_loss_clip": 0.01141567, "auxiliary_loss_mlp": 0.01023205, "balance_loss_clip": 1.04645801, "balance_loss_mlp": 1.01639795, "epoch": 0.9637467684723141, "flos": 22601781457920.0, "grad_norm": 1.7471794693052107, "language_loss": 0.76093793, "learning_rate": 1.3728365357789317e-08, "loss": 0.78258562, "num_input_tokens_seen": 172993290, "step": 8015, "time_per_iteration": 3.3034112453460693 }, { "auxiliary_loss_clip": 0.01081123, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.03944802, "balance_loss_mlp": 1.02334976, "epoch": 0.9638670113629532, "flos": 17565812801280.0, "grad_norm": 4.740638599253653, "language_loss": 0.76497662, "learning_rate": 1.3637402143680254e-08, "loss": 0.78609163, "num_input_tokens_seen": 173008190, "step": 8016, "time_per_iteration": 2.6376500129699707 }, { "auxiliary_loss_clip": 0.0101512, "auxiliary_loss_mlp": 0.01001466, "balance_loss_clip": 1.00894451, "balance_loss_mlp": 1.00045824, "epoch": 0.9639872542535922, "flos": 55072139379840.0, "grad_norm": 0.720713846831545, "language_loss": 0.55068421, "learning_rate": 1.3546740259197998e-08, "loss": 0.57085013, "num_input_tokens_seen": 173061000, "step": 8017, "time_per_iteration": 3.090174913406372 }, { "auxiliary_loss_clip": 0.01134884, "auxiliary_loss_mlp": 0.01031663, "balance_loss_clip": 1.04525805, "balance_loss_mlp": 1.02403975, "epoch": 0.9641074971442314, "flos": 24134484746880.0, "grad_norm": 1.8184694991409707, "language_loss": 0.69451487, "learning_rate": 1.3456379718095989e-08, "loss": 0.71618032, "num_input_tokens_seen": 173081415, "step": 8018, "time_per_iteration": 2.5589301586151123 }, { "auxiliary_loss_clip": 0.01033026, "auxiliary_loss_mlp": 0.01002611, "balance_loss_clip": 1.00700903, "balance_loss_mlp": 1.00152004, "epoch": 0.9642277400348704, "flos": 66747416077440.0, "grad_norm": 0.8369845924738059, "language_loss": 0.62050074, "learning_rate": 1.3366320534081487e-08, "loss": 0.6408571, "num_input_tokens_seen": 173144095, "step": 8019, "time_per_iteration": 3.133159875869751 }, { "auxiliary_loss_clip": 0.01148231, "auxiliary_loss_mlp": 0.01025795, "balance_loss_clip": 1.04605758, "balance_loss_mlp": 1.0185585, "epoch": 0.9643479829255095, "flos": 30920272450560.0, "grad_norm": 2.0691412256184565, "language_loss": 0.75809002, "learning_rate": 1.3276562720816675e-08, "loss": 0.77983034, "num_input_tokens_seen": 173165605, "step": 8020, "time_per_iteration": 2.556793451309204 }, { "auxiliary_loss_clip": 0.0116206, "auxiliary_loss_mlp": 0.01022247, "balance_loss_clip": 1.04591751, "balance_loss_mlp": 1.01497817, "epoch": 0.9644682258161487, "flos": 20048245643520.0, "grad_norm": 1.9585675962535611, "language_loss": 0.82268924, "learning_rate": 1.3187106291917549e-08, "loss": 0.84453225, "num_input_tokens_seen": 173182595, "step": 8021, "time_per_iteration": 2.4417073726654053 }, { "auxiliary_loss_clip": 0.01144539, "auxiliary_loss_mlp": 0.01022393, "balance_loss_clip": 1.04507875, "balance_loss_mlp": 1.01549625, "epoch": 0.9645884687067877, "flos": 21178713456000.0, "grad_norm": 1.8680588291869193, "language_loss": 0.70478874, "learning_rate": 1.309795126095503e-08, "loss": 0.72645807, "num_input_tokens_seen": 173200895, "step": 8022, "time_per_iteration": 2.473043441772461 }, { "auxiliary_loss_clip": 0.01073435, "auxiliary_loss_mlp": 0.01023473, "balance_loss_clip": 1.03568912, "balance_loss_mlp": 1.01645446, "epoch": 0.9647087115974268, "flos": 18945967029120.0, "grad_norm": 2.0155131711993794, "language_loss": 0.80428147, "learning_rate": 1.3009097641453192e-08, "loss": 0.82525051, "num_input_tokens_seen": 173218745, "step": 8023, "time_per_iteration": 2.636964797973633 }, { "auxiliary_loss_clip": 0.01131064, "auxiliary_loss_mlp": 0.01021894, "balance_loss_clip": 1.04320383, "balance_loss_mlp": 1.01537037, "epoch": 0.9648289544880659, "flos": 16545088016640.0, "grad_norm": 1.7013841529184617, "language_loss": 0.75984609, "learning_rate": 1.2920545446891474e-08, "loss": 0.78137571, "num_input_tokens_seen": 173235465, "step": 8024, "time_per_iteration": 2.499638557434082 }, { "auxiliary_loss_clip": 0.01139012, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.04730082, "balance_loss_mlp": 1.02530241, "epoch": 0.964949197378705, "flos": 24057527857920.0, "grad_norm": 1.6335595392790765, "language_loss": 0.70644033, "learning_rate": 1.2832294690703127e-08, "loss": 0.72815037, "num_input_tokens_seen": 173254440, "step": 8025, "time_per_iteration": 2.5447640419006348 }, { "auxiliary_loss_clip": 0.01148508, "auxiliary_loss_mlp": 0.01026559, "balance_loss_clip": 1.04738665, "balance_loss_mlp": 1.01968956, "epoch": 0.965069440269344, "flos": 23365565280000.0, "grad_norm": 1.8873550800801469, "language_loss": 0.77633178, "learning_rate": 1.2744345386275668e-08, "loss": 0.79808241, "num_input_tokens_seen": 173273980, "step": 8026, "time_per_iteration": 3.336887836456299 }, { "auxiliary_loss_clip": 0.01138753, "auxiliary_loss_mlp": 0.0102854, "balance_loss_clip": 1.04812884, "balance_loss_mlp": 1.02171838, "epoch": 0.9651896831599832, "flos": 25374875155200.0, "grad_norm": 2.591240529162104, "language_loss": 0.78804624, "learning_rate": 1.265669754695109e-08, "loss": 0.80971915, "num_input_tokens_seen": 173293550, "step": 8027, "time_per_iteration": 2.5648958683013916 }, { "auxiliary_loss_clip": 0.01092723, "auxiliary_loss_mlp": 0.01028962, "balance_loss_clip": 1.03666782, "balance_loss_mlp": 1.02228951, "epoch": 0.9653099260506223, "flos": 22272875596800.0, "grad_norm": 2.003720901965906, "language_loss": 0.82313472, "learning_rate": 1.2569351186025201e-08, "loss": 0.84435153, "num_input_tokens_seen": 173312005, "step": 8028, "time_per_iteration": 2.60941481590271 }, { "auxiliary_loss_clip": 0.0111078, "auxiliary_loss_mlp": 0.01027008, "balance_loss_clip": 1.04040575, "balance_loss_mlp": 1.02040684, "epoch": 0.9654301689412613, "flos": 26760847386240.0, "grad_norm": 1.4928113826956364, "language_loss": 0.75279081, "learning_rate": 1.2482306316748737e-08, "loss": 0.77416867, "num_input_tokens_seen": 173332450, "step": 8029, "time_per_iteration": 2.60005521774292 }, { "auxiliary_loss_clip": 0.01154075, "auxiliary_loss_mlp": 0.01026278, "balance_loss_clip": 1.0455364, "balance_loss_mlp": 1.01919127, "epoch": 0.9655504118319005, "flos": 17412689122560.0, "grad_norm": 2.009953060543939, "language_loss": 0.78113323, "learning_rate": 1.2395562952326021e-08, "loss": 0.80293679, "num_input_tokens_seen": 173349610, "step": 8030, "time_per_iteration": 2.5015058517456055 }, { "auxiliary_loss_clip": 0.01145365, "auxiliary_loss_mlp": 0.01032734, "balance_loss_clip": 1.04739654, "balance_loss_mlp": 1.02528977, "epoch": 0.9656706547225395, "flos": 22126970551680.0, "grad_norm": 2.0495203606173953, "language_loss": 0.81058371, "learning_rate": 1.2309121105916309e-08, "loss": 0.83236474, "num_input_tokens_seen": 173367900, "step": 8031, "time_per_iteration": 2.540872573852539 }, { "auxiliary_loss_clip": 0.01152545, "auxiliary_loss_mlp": 0.0102345, "balance_loss_clip": 1.04607797, "balance_loss_mlp": 1.01616597, "epoch": 0.9657908976131786, "flos": 37049289926400.0, "grad_norm": 1.9882603532194607, "language_loss": 0.69072533, "learning_rate": 1.222298079063222e-08, "loss": 0.71248519, "num_input_tokens_seen": 173389040, "step": 8032, "time_per_iteration": 2.624593734741211 }, { "auxiliary_loss_clip": 0.01147167, "auxiliary_loss_mlp": 0.01029301, "balance_loss_clip": 1.04459548, "balance_loss_mlp": 1.02285147, "epoch": 0.9659111405038178, "flos": 24389809597440.0, "grad_norm": 2.0659332924208464, "language_loss": 0.72624552, "learning_rate": 1.2137142019541524e-08, "loss": 0.74801022, "num_input_tokens_seen": 173407595, "step": 8033, "time_per_iteration": 2.5246968269348145 }, { "auxiliary_loss_clip": 0.01138128, "auxiliary_loss_mlp": 0.01024463, "balance_loss_clip": 1.04474676, "balance_loss_mlp": 1.01778173, "epoch": 0.9660313833944568, "flos": 25009412227200.0, "grad_norm": 1.8572177958528888, "language_loss": 0.73451936, "learning_rate": 1.2051604805666027e-08, "loss": 0.7561453, "num_input_tokens_seen": 173424720, "step": 8034, "time_per_iteration": 2.5371508598327637 }, { "auxiliary_loss_clip": 0.01162816, "auxiliary_loss_mlp": 0.00760801, "balance_loss_clip": 1.04788685, "balance_loss_mlp": 1.00030971, "epoch": 0.9661516262850959, "flos": 11801575895040.0, "grad_norm": 2.0617299561210074, "language_loss": 0.78705388, "learning_rate": 1.196636916198135e-08, "loss": 0.80629009, "num_input_tokens_seen": 173442260, "step": 8035, "time_per_iteration": 2.4587206840515137 }, { "auxiliary_loss_clip": 0.01165419, "auxiliary_loss_mlp": 0.01019921, "balance_loss_clip": 1.04791093, "balance_loss_mlp": 1.01353776, "epoch": 0.9662718691757349, "flos": 20047778766720.0, "grad_norm": 2.0605757614902607, "language_loss": 0.77120352, "learning_rate": 1.1881435101418036e-08, "loss": 0.79305691, "num_input_tokens_seen": 173461675, "step": 8036, "time_per_iteration": 2.4489939212799072 }, { "auxiliary_loss_clip": 0.01036253, "auxiliary_loss_mlp": 0.01004656, "balance_loss_clip": 1.00958371, "balance_loss_mlp": 1.00346982, "epoch": 0.9663921120663741, "flos": 68027703517440.0, "grad_norm": 0.726068738057863, "language_loss": 0.65608633, "learning_rate": 1.1796802636860003e-08, "loss": 0.67649543, "num_input_tokens_seen": 173530205, "step": 8037, "time_per_iteration": 3.165302276611328 }, { "auxiliary_loss_clip": 0.01164414, "auxiliary_loss_mlp": 0.01023534, "balance_loss_clip": 1.04721463, "balance_loss_mlp": 1.01630056, "epoch": 0.9665123549570132, "flos": 26322916769280.0, "grad_norm": 1.8186478840542408, "language_loss": 0.73872185, "learning_rate": 1.1712471781146316e-08, "loss": 0.76060128, "num_input_tokens_seen": 173549540, "step": 8038, "time_per_iteration": 3.3481268882751465 }, { "auxiliary_loss_clip": 0.01161295, "auxiliary_loss_mlp": 0.01022796, "balance_loss_clip": 1.04602647, "balance_loss_mlp": 1.0158577, "epoch": 0.9666325978476522, "flos": 43941121557120.0, "grad_norm": 1.8243842175417146, "language_loss": 0.66707993, "learning_rate": 1.1628442547069628e-08, "loss": 0.68892074, "num_input_tokens_seen": 173571740, "step": 8039, "time_per_iteration": 3.4186246395111084 }, { "auxiliary_loss_clip": 0.01151956, "auxiliary_loss_mlp": 0.00760898, "balance_loss_clip": 1.04544163, "balance_loss_mlp": 1.00032139, "epoch": 0.9667528407382914, "flos": 21543422198400.0, "grad_norm": 2.223766572856778, "language_loss": 0.77345681, "learning_rate": 1.1544714947377521e-08, "loss": 0.79258537, "num_input_tokens_seen": 173589425, "step": 8040, "time_per_iteration": 2.502368927001953 }, { "auxiliary_loss_clip": 0.01165358, "auxiliary_loss_mlp": 0.01021985, "balance_loss_clip": 1.04885459, "balance_loss_mlp": 1.01499605, "epoch": 0.9668730836289304, "flos": 23878585278720.0, "grad_norm": 2.116444685873814, "language_loss": 0.69865924, "learning_rate": 1.1461288994770945e-08, "loss": 0.72053266, "num_input_tokens_seen": 173608500, "step": 8041, "time_per_iteration": 3.158148765563965 }, { "auxiliary_loss_clip": 0.01164267, "auxiliary_loss_mlp": 0.01032041, "balance_loss_clip": 1.04609609, "balance_loss_mlp": 1.0239706, "epoch": 0.9669933265195695, "flos": 28293011971200.0, "grad_norm": 1.592504663399579, "language_loss": 0.77075821, "learning_rate": 1.1378164701906002e-08, "loss": 0.79272127, "num_input_tokens_seen": 173630265, "step": 8042, "time_per_iteration": 2.507467746734619 }, { "auxiliary_loss_clip": 0.01164516, "auxiliary_loss_mlp": 0.01027153, "balance_loss_clip": 1.04721248, "balance_loss_mlp": 1.02015853, "epoch": 0.9671135694102087, "flos": 22454763091200.0, "grad_norm": 1.851034624061108, "language_loss": 0.6688627, "learning_rate": 1.1295342081392156e-08, "loss": 0.69077933, "num_input_tokens_seen": 173649625, "step": 8043, "time_per_iteration": 2.441541910171509 }, { "auxiliary_loss_clip": 0.01134592, "auxiliary_loss_mlp": 0.01024117, "balance_loss_clip": 1.04525661, "balance_loss_mlp": 1.01770914, "epoch": 0.9672338123008477, "flos": 20155941596160.0, "grad_norm": 1.7795042378469135, "language_loss": 0.69441521, "learning_rate": 1.1212821145793804e-08, "loss": 0.71600223, "num_input_tokens_seen": 173669240, "step": 8044, "time_per_iteration": 2.5213499069213867 }, { "auxiliary_loss_clip": 0.01134966, "auxiliary_loss_mlp": 0.01025128, "balance_loss_clip": 1.0425694, "balance_loss_mlp": 1.01855052, "epoch": 0.9673540551914868, "flos": 16977487939200.0, "grad_norm": 1.971050004116924, "language_loss": 0.78719628, "learning_rate": 1.1130601907629156e-08, "loss": 0.80879724, "num_input_tokens_seen": 173686970, "step": 8045, "time_per_iteration": 2.492058277130127 }, { "auxiliary_loss_clip": 0.01045608, "auxiliary_loss_mlp": 0.01001667, "balance_loss_clip": 1.00739837, "balance_loss_mlp": 1.00052834, "epoch": 0.9674742980821259, "flos": 61892903952000.0, "grad_norm": 0.8126656294730421, "language_loss": 0.64934099, "learning_rate": 1.1048684379370899e-08, "loss": 0.66981375, "num_input_tokens_seen": 173747655, "step": 8046, "time_per_iteration": 3.052504301071167 }, { "auxiliary_loss_clip": 0.01124525, "auxiliary_loss_mlp": 0.01030192, "balance_loss_clip": 1.04240799, "balance_loss_mlp": 1.02377534, "epoch": 0.967594540972765, "flos": 18697824898560.0, "grad_norm": 2.3702098241923855, "language_loss": 0.74418485, "learning_rate": 1.0967068573445759e-08, "loss": 0.76573205, "num_input_tokens_seen": 173765140, "step": 8047, "time_per_iteration": 2.4811084270477295 }, { "auxiliary_loss_clip": 0.0113343, "auxiliary_loss_mlp": 0.01025286, "balance_loss_clip": 1.04336452, "balance_loss_mlp": 1.01824093, "epoch": 0.967714783863404, "flos": 20777411733120.0, "grad_norm": 2.0145606640807414, "language_loss": 0.65232885, "learning_rate": 1.0885754502234945e-08, "loss": 0.67391598, "num_input_tokens_seen": 173784800, "step": 8048, "time_per_iteration": 2.526123046875 }, { "auxiliary_loss_clip": 0.0111399, "auxiliary_loss_mlp": 0.01023044, "balance_loss_clip": 1.04245353, "balance_loss_mlp": 1.01636231, "epoch": 0.9678350267540432, "flos": 23185473465600.0, "grad_norm": 1.9384649418964794, "language_loss": 0.7771014, "learning_rate": 1.08047421780737e-08, "loss": 0.79847175, "num_input_tokens_seen": 173803990, "step": 8049, "time_per_iteration": 2.5408973693847656 }, { "auxiliary_loss_clip": 0.01142995, "auxiliary_loss_mlp": 0.00760694, "balance_loss_clip": 1.0464282, "balance_loss_mlp": 1.00026476, "epoch": 0.9679552696446823, "flos": 21726063878400.0, "grad_norm": 1.9868533916047577, "language_loss": 0.73874098, "learning_rate": 1.0724031613251305e-08, "loss": 0.75777787, "num_input_tokens_seen": 173821890, "step": 8050, "time_per_iteration": 2.527113914489746 }, { "auxiliary_loss_clip": 0.01154689, "auxiliary_loss_mlp": 0.01029758, "balance_loss_clip": 1.04585457, "balance_loss_mlp": 1.02231908, "epoch": 0.9680755125353213, "flos": 26869046129280.0, "grad_norm": 2.193039141365398, "language_loss": 0.66236264, "learning_rate": 1.0643622820011744e-08, "loss": 0.68420708, "num_input_tokens_seen": 173842945, "step": 8051, "time_per_iteration": 2.526540994644165 }, { "auxiliary_loss_clip": 0.01165673, "auxiliary_loss_mlp": 0.01024026, "balance_loss_clip": 1.04705775, "balance_loss_mlp": 1.01684105, "epoch": 0.9681957554259605, "flos": 28325008010880.0, "grad_norm": 2.220781664766999, "language_loss": 0.68001628, "learning_rate": 1.0563515810552814e-08, "loss": 0.70191324, "num_input_tokens_seen": 173859915, "step": 8052, "time_per_iteration": 3.1951167583465576 }, { "auxiliary_loss_clip": 0.01163723, "auxiliary_loss_mlp": 0.01025297, "balance_loss_clip": 1.04853463, "balance_loss_mlp": 1.01877046, "epoch": 0.9683159983165995, "flos": 20557674282240.0, "grad_norm": 1.485311455285863, "language_loss": 0.73130739, "learning_rate": 1.0483710597026795e-08, "loss": 0.75319755, "num_input_tokens_seen": 173879775, "step": 8053, "time_per_iteration": 2.4701015949249268 }, { "auxiliary_loss_clip": 0.01121117, "auxiliary_loss_mlp": 0.01023842, "balance_loss_clip": 1.04428351, "balance_loss_mlp": 1.01716948, "epoch": 0.9684362412072386, "flos": 24207958016640.0, "grad_norm": 2.0197127879419994, "language_loss": 0.73988187, "learning_rate": 1.0404207191540227e-08, "loss": 0.76133144, "num_input_tokens_seen": 173900230, "step": 8054, "time_per_iteration": 2.5984811782836914 }, { "auxiliary_loss_clip": 0.0116225, "auxiliary_loss_mlp": 0.01025857, "balance_loss_clip": 1.04755437, "balance_loss_mlp": 1.01926231, "epoch": 0.9685564840978778, "flos": 22346241125760.0, "grad_norm": 1.7973320348293138, "language_loss": 0.74580777, "learning_rate": 1.0325005606153236e-08, "loss": 0.76768887, "num_input_tokens_seen": 173919690, "step": 8055, "time_per_iteration": 2.5618155002593994 }, { "auxiliary_loss_clip": 0.01105739, "auxiliary_loss_mlp": 0.01026144, "balance_loss_clip": 1.04054284, "balance_loss_mlp": 1.0193069, "epoch": 0.9686767269885168, "flos": 14386389477120.0, "grad_norm": 2.461169327159431, "language_loss": 0.78769219, "learning_rate": 1.0246105852881104e-08, "loss": 0.80901098, "num_input_tokens_seen": 173934790, "step": 8056, "time_per_iteration": 2.6134443283081055 }, { "auxiliary_loss_clip": 0.01164731, "auxiliary_loss_mlp": 0.01024256, "balance_loss_clip": 1.04690683, "balance_loss_mlp": 1.01717782, "epoch": 0.9687969698791559, "flos": 21287630471040.0, "grad_norm": 1.7879268187574222, "language_loss": 0.78713799, "learning_rate": 1.0167507943692476e-08, "loss": 0.80902779, "num_input_tokens_seen": 173953875, "step": 8057, "time_per_iteration": 2.5377798080444336 }, { "auxiliary_loss_clip": 0.01148445, "auxiliary_loss_mlp": 0.01028977, "balance_loss_clip": 1.04938304, "balance_loss_mlp": 1.02199447, "epoch": 0.968917212769795, "flos": 19828328624640.0, "grad_norm": 2.1446800513721884, "language_loss": 0.71085203, "learning_rate": 1.008921189051093e-08, "loss": 0.73262626, "num_input_tokens_seen": 173971220, "step": 8058, "time_per_iteration": 2.527008295059204 }, { "auxiliary_loss_clip": 0.01165992, "auxiliary_loss_mlp": 0.01026387, "balance_loss_clip": 1.04906547, "balance_loss_mlp": 1.01870668, "epoch": 0.9690374556604341, "flos": 21681749473920.0, "grad_norm": 1.919580542145979, "language_loss": 0.7686522, "learning_rate": 1.0011217705213848e-08, "loss": 0.79057604, "num_input_tokens_seen": 173989095, "step": 8059, "time_per_iteration": 2.5254733562469482 }, { "auxiliary_loss_clip": 0.01146268, "auxiliary_loss_mlp": 0.01022523, "balance_loss_clip": 1.04599035, "balance_loss_mlp": 1.01645541, "epoch": 0.9691576985510731, "flos": 32635437851520.0, "grad_norm": 1.7550523547618049, "language_loss": 0.74199909, "learning_rate": 9.933525399632658e-09, "loss": 0.76368701, "num_input_tokens_seen": 174007330, "step": 8060, "time_per_iteration": 2.610304832458496 }, { "auxiliary_loss_clip": 0.01130959, "auxiliary_loss_mlp": 0.01026309, "balance_loss_clip": 1.04256654, "balance_loss_mlp": 1.01895034, "epoch": 0.9692779414417123, "flos": 35663174040960.0, "grad_norm": 1.7486629397755933, "language_loss": 0.64891952, "learning_rate": 9.856134985553488e-09, "loss": 0.67049217, "num_input_tokens_seen": 174027055, "step": 8061, "time_per_iteration": 2.6411588191986084 }, { "auxiliary_loss_clip": 0.01163483, "auxiliary_loss_mlp": 0.01026599, "balance_loss_clip": 1.04761577, "balance_loss_mlp": 1.01916599, "epoch": 0.9693981843323514, "flos": 28366952117760.0, "grad_norm": 1.538402884096897, "language_loss": 0.73476315, "learning_rate": 9.77904647471628e-09, "loss": 0.75666398, "num_input_tokens_seen": 174050235, "step": 8062, "time_per_iteration": 2.5111989974975586 }, { "auxiliary_loss_clip": 0.01097147, "auxiliary_loss_mlp": 0.01026132, "balance_loss_clip": 1.04008579, "balance_loss_mlp": 1.0190661, "epoch": 0.9695184272229904, "flos": 23622865378560.0, "grad_norm": 1.4112846098866993, "language_loss": 0.73961675, "learning_rate": 9.702259878815454e-09, "loss": 0.76084948, "num_input_tokens_seen": 174070560, "step": 8063, "time_per_iteration": 2.590095043182373 }, { "auxiliary_loss_clip": 0.01152205, "auxiliary_loss_mlp": 0.01039069, "balance_loss_clip": 1.04749322, "balance_loss_mlp": 1.03074849, "epoch": 0.9696386701136296, "flos": 23294677789440.0, "grad_norm": 2.0668136196081446, "language_loss": 0.74316698, "learning_rate": 9.625775209499254e-09, "loss": 0.76507974, "num_input_tokens_seen": 174090565, "step": 8064, "time_per_iteration": 3.216752290725708 }, { "auxiliary_loss_clip": 0.01115396, "auxiliary_loss_mlp": 0.010284, "balance_loss_clip": 1.04031372, "balance_loss_mlp": 1.02107418, "epoch": 0.9697589130042686, "flos": 15121876360320.0, "grad_norm": 1.9153722257479209, "language_loss": 0.74084312, "learning_rate": 9.549592478370172e-09, "loss": 0.76228106, "num_input_tokens_seen": 174108745, "step": 8065, "time_per_iteration": 3.3043999671936035 }, { "auxiliary_loss_clip": 0.01151803, "auxiliary_loss_mlp": 0.010282, "balance_loss_clip": 1.0459131, "balance_loss_mlp": 1.02150369, "epoch": 0.9698791558949077, "flos": 18879532824960.0, "grad_norm": 1.7098136012528384, "language_loss": 0.79346538, "learning_rate": 9.473711696985632e-09, "loss": 0.81526542, "num_input_tokens_seen": 174128075, "step": 8066, "time_per_iteration": 2.486731767654419 }, { "auxiliary_loss_clip": 0.01129948, "auxiliary_loss_mlp": 0.01021145, "balance_loss_clip": 1.04362917, "balance_loss_mlp": 1.01378345, "epoch": 0.9699993987855468, "flos": 17931455297280.0, "grad_norm": 2.0947633348214207, "language_loss": 0.75658727, "learning_rate": 9.398132876856201e-09, "loss": 0.77809817, "num_input_tokens_seen": 174147040, "step": 8067, "time_per_iteration": 3.1684863567352295 }, { "auxiliary_loss_clip": 0.01015668, "auxiliary_loss_mlp": 0.01002015, "balance_loss_clip": 1.00845218, "balance_loss_mlp": 1.00082898, "epoch": 0.9701196416761859, "flos": 67182186297600.0, "grad_norm": 0.7969104749980803, "language_loss": 0.60806471, "learning_rate": 9.322856029447379e-09, "loss": 0.62824154, "num_input_tokens_seen": 174208225, "step": 8068, "time_per_iteration": 3.0715153217315674 }, { "auxiliary_loss_clip": 0.01159466, "auxiliary_loss_mlp": 0.01024808, "balance_loss_clip": 1.04563808, "balance_loss_mlp": 1.01827264, "epoch": 0.970239884566825, "flos": 24277804012800.0, "grad_norm": 1.9462868313382278, "language_loss": 0.80149496, "learning_rate": 9.247881166178695e-09, "loss": 0.82333767, "num_input_tokens_seen": 174226935, "step": 8069, "time_per_iteration": 2.4667961597442627 }, { "auxiliary_loss_clip": 0.01129881, "auxiliary_loss_mlp": 0.01027024, "balance_loss_clip": 1.0436312, "balance_loss_mlp": 1.02007711, "epoch": 0.970360127457464, "flos": 25301689194240.0, "grad_norm": 2.2430495345324006, "language_loss": 0.76865351, "learning_rate": 9.173208298423274e-09, "loss": 0.79022253, "num_input_tokens_seen": 174248140, "step": 8070, "time_per_iteration": 2.6132006645202637 }, { "auxiliary_loss_clip": 0.01099928, "auxiliary_loss_mlp": 0.00760612, "balance_loss_clip": 1.03994918, "balance_loss_mlp": 1.00030088, "epoch": 0.9704803703481032, "flos": 29572473398400.0, "grad_norm": 1.5262850194703033, "language_loss": 0.76314145, "learning_rate": 9.09883743750961e-09, "loss": 0.78174686, "num_input_tokens_seen": 174271030, "step": 8071, "time_per_iteration": 2.670663356781006 }, { "auxiliary_loss_clip": 0.01132525, "auxiliary_loss_mlp": 0.01021422, "balance_loss_clip": 1.04459739, "balance_loss_mlp": 1.01496065, "epoch": 0.9706006132387422, "flos": 17380046638080.0, "grad_norm": 1.5953721746917666, "language_loss": 0.84062743, "learning_rate": 9.024768594719124e-09, "loss": 0.86216688, "num_input_tokens_seen": 174289410, "step": 8072, "time_per_iteration": 2.5339019298553467 }, { "auxiliary_loss_clip": 0.01118864, "auxiliary_loss_mlp": 0.0102202, "balance_loss_clip": 1.04227567, "balance_loss_mlp": 1.01494491, "epoch": 0.9707208561293813, "flos": 18186421011840.0, "grad_norm": 2.0774861144161267, "language_loss": 0.72566676, "learning_rate": 8.95100178128816e-09, "loss": 0.74707556, "num_input_tokens_seen": 174308550, "step": 8073, "time_per_iteration": 2.5342602729797363 }, { "auxiliary_loss_clip": 0.01134596, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.04360175, "balance_loss_mlp": 1.01952589, "epoch": 0.9708410990200205, "flos": 31248388212480.0, "grad_norm": 1.8243659228545839, "language_loss": 0.70059413, "learning_rate": 8.877537008407321e-09, "loss": 0.72220904, "num_input_tokens_seen": 174328600, "step": 8074, "time_per_iteration": 2.6209964752197266 }, { "auxiliary_loss_clip": 0.01139122, "auxiliary_loss_mlp": 0.01024687, "balance_loss_clip": 1.04533923, "balance_loss_mlp": 1.01743925, "epoch": 0.9709613419106595, "flos": 30554450386560.0, "grad_norm": 1.6304366229978284, "language_loss": 0.68915123, "learning_rate": 8.804374287221028e-09, "loss": 0.71078932, "num_input_tokens_seen": 174349835, "step": 8075, "time_per_iteration": 2.594649076461792 }, { "auxiliary_loss_clip": 0.01110122, "auxiliary_loss_mlp": 0.01025043, "balance_loss_clip": 1.03663349, "balance_loss_mlp": 1.01835251, "epoch": 0.9710815848012986, "flos": 23730166281600.0, "grad_norm": 1.5925995348221036, "language_loss": 0.8452059, "learning_rate": 8.731513628827958e-09, "loss": 0.86655748, "num_input_tokens_seen": 174369200, "step": 8076, "time_per_iteration": 2.6688039302825928 }, { "auxiliary_loss_clip": 0.01150363, "auxiliary_loss_mlp": 0.01023915, "balance_loss_clip": 1.04595697, "balance_loss_mlp": 1.0175494, "epoch": 0.9712018276919377, "flos": 23761875012480.0, "grad_norm": 1.7963410357847347, "language_loss": 0.8237555, "learning_rate": 8.658955044280825e-09, "loss": 0.84549826, "num_input_tokens_seen": 174388125, "step": 8077, "time_per_iteration": 2.5142860412597656 }, { "auxiliary_loss_clip": 0.01143944, "auxiliary_loss_mlp": 0.0102763, "balance_loss_clip": 1.04519153, "balance_loss_mlp": 1.02082014, "epoch": 0.9713220705825768, "flos": 23330983461120.0, "grad_norm": 1.9055696764295664, "language_loss": 0.77714193, "learning_rate": 8.586698544587268e-09, "loss": 0.79885769, "num_input_tokens_seen": 174409735, "step": 8078, "time_per_iteration": 3.207247257232666 }, { "auxiliary_loss_clip": 0.01127779, "auxiliary_loss_mlp": 0.01030417, "balance_loss_clip": 1.04272985, "balance_loss_mlp": 1.02348483, "epoch": 0.9714423134732159, "flos": 22200946611840.0, "grad_norm": 2.2947661143438864, "language_loss": 0.73990333, "learning_rate": 8.514744140707853e-09, "loss": 0.76148522, "num_input_tokens_seen": 174428875, "step": 8079, "time_per_iteration": 2.5396230220794678 }, { "auxiliary_loss_clip": 0.01163397, "auxiliary_loss_mlp": 0.01024535, "balance_loss_clip": 1.04877496, "balance_loss_mlp": 1.01799941, "epoch": 0.971562556363855, "flos": 20229917656320.0, "grad_norm": 1.5626393340455627, "language_loss": 0.76435322, "learning_rate": 8.443091843558515e-09, "loss": 0.78623247, "num_input_tokens_seen": 174447960, "step": 8080, "time_per_iteration": 2.453338384628296 }, { "auxiliary_loss_clip": 0.01130958, "auxiliary_loss_mlp": 0.01025416, "balance_loss_clip": 1.04528356, "balance_loss_mlp": 1.017977, "epoch": 0.9716827992544941, "flos": 24970197553920.0, "grad_norm": 2.1393933601517148, "language_loss": 0.6463204, "learning_rate": 8.37174166400878e-09, "loss": 0.66788417, "num_input_tokens_seen": 174463535, "step": 8081, "time_per_iteration": 2.5425164699554443 }, { "auxiliary_loss_clip": 0.01165516, "auxiliary_loss_mlp": 0.01031544, "balance_loss_clip": 1.04973578, "balance_loss_mlp": 1.02468634, "epoch": 0.9718030421451331, "flos": 24681476033280.0, "grad_norm": 2.211691472579568, "language_loss": 0.85322189, "learning_rate": 8.300693612881992e-09, "loss": 0.87519252, "num_input_tokens_seen": 174483600, "step": 8082, "time_per_iteration": 2.4732353687286377 }, { "auxiliary_loss_clip": 0.01149634, "auxiliary_loss_mlp": 0.00760804, "balance_loss_clip": 1.04652202, "balance_loss_mlp": 1.00037432, "epoch": 0.9719232850357723, "flos": 22090700793600.0, "grad_norm": 1.8367841699028, "language_loss": 0.81086242, "learning_rate": 8.22994770095664e-09, "loss": 0.82996678, "num_input_tokens_seen": 174502175, "step": 8083, "time_per_iteration": 2.492948293685913 }, { "auxiliary_loss_clip": 0.01133968, "auxiliary_loss_mlp": 0.0103103, "balance_loss_clip": 1.0483017, "balance_loss_mlp": 1.02385688, "epoch": 0.9720435279264114, "flos": 23656908493440.0, "grad_norm": 2.0567772561964044, "language_loss": 0.75549889, "learning_rate": 8.159503938964585e-09, "loss": 0.77714896, "num_input_tokens_seen": 174519495, "step": 8084, "time_per_iteration": 2.514460802078247 }, { "auxiliary_loss_clip": 0.01111672, "auxiliary_loss_mlp": 0.01028721, "balance_loss_clip": 1.04154742, "balance_loss_mlp": 1.02246261, "epoch": 0.9721637708170504, "flos": 28365910623360.0, "grad_norm": 2.0234816602474672, "language_loss": 0.7009306, "learning_rate": 8.089362337592164e-09, "loss": 0.7223345, "num_input_tokens_seen": 174543120, "step": 8085, "time_per_iteration": 2.6185779571533203 }, { "auxiliary_loss_clip": 0.01134102, "auxiliary_loss_mlp": 0.01026225, "balance_loss_clip": 1.0464915, "balance_loss_mlp": 1.01963305, "epoch": 0.9722840137076896, "flos": 29130807767040.0, "grad_norm": 1.5559959883015504, "language_loss": 0.72174954, "learning_rate": 8.019522907479536e-09, "loss": 0.74335277, "num_input_tokens_seen": 174563480, "step": 8086, "time_per_iteration": 2.610379457473755 }, { "auxiliary_loss_clip": 0.01151386, "auxiliary_loss_mlp": 0.01027153, "balance_loss_clip": 1.04642642, "balance_loss_mlp": 1.02008998, "epoch": 0.9724042565983286, "flos": 19243954258560.0, "grad_norm": 2.1070561135427113, "language_loss": 0.77498078, "learning_rate": 7.949985659221558e-09, "loss": 0.79676622, "num_input_tokens_seen": 174580745, "step": 8087, "time_per_iteration": 2.505854368209839 }, { "auxiliary_loss_clip": 0.01138107, "auxiliary_loss_mlp": 0.01030383, "balance_loss_clip": 1.04434991, "balance_loss_mlp": 1.02367687, "epoch": 0.9725244994889677, "flos": 23039676161280.0, "grad_norm": 1.9035924405317586, "language_loss": 0.78760314, "learning_rate": 7.880750603366904e-09, "loss": 0.80928802, "num_input_tokens_seen": 174599615, "step": 8088, "time_per_iteration": 2.5834314823150635 }, { "auxiliary_loss_clip": 0.0113118, "auxiliary_loss_mlp": 0.0102961, "balance_loss_clip": 1.04316616, "balance_loss_mlp": 1.02222204, "epoch": 0.9726447423796069, "flos": 23367468700800.0, "grad_norm": 1.7882575666380598, "language_loss": 0.79615492, "learning_rate": 7.811817750418282e-09, "loss": 0.81776285, "num_input_tokens_seen": 174618375, "step": 8089, "time_per_iteration": 2.5912866592407227 }, { "auxiliary_loss_clip": 0.01117942, "auxiliary_loss_mlp": 0.01027688, "balance_loss_clip": 1.04500079, "balance_loss_mlp": 1.02029145, "epoch": 0.9727649852702459, "flos": 26541648639360.0, "grad_norm": 1.5263123485955303, "language_loss": 0.80120122, "learning_rate": 7.743187110833105e-09, "loss": 0.82265753, "num_input_tokens_seen": 174641135, "step": 8090, "time_per_iteration": 3.4511404037475586 }, { "auxiliary_loss_clip": 0.01136567, "auxiliary_loss_mlp": 0.0102261, "balance_loss_clip": 1.04227948, "balance_loss_mlp": 1.01619065, "epoch": 0.972885228160885, "flos": 20522338277760.0, "grad_norm": 1.4116855766068876, "language_loss": 0.80627573, "learning_rate": 7.674858695022602e-09, "loss": 0.82786751, "num_input_tokens_seen": 174659490, "step": 8091, "time_per_iteration": 3.327000856399536 }, { "auxiliary_loss_clip": 0.01166252, "auxiliary_loss_mlp": 0.01022249, "balance_loss_clip": 1.04886746, "balance_loss_mlp": 1.01463127, "epoch": 0.9730054710515241, "flos": 17566064196480.0, "grad_norm": 2.3331948240445417, "language_loss": 0.76291311, "learning_rate": 7.606832513351591e-09, "loss": 0.78479815, "num_input_tokens_seen": 174677440, "step": 8092, "time_per_iteration": 2.4912893772125244 }, { "auxiliary_loss_clip": 0.01054518, "auxiliary_loss_mlp": 0.00751194, "balance_loss_clip": 1.00776875, "balance_loss_mlp": 1.00017309, "epoch": 0.9731257139421632, "flos": 68972010117120.0, "grad_norm": 0.8192911684423997, "language_loss": 0.63923639, "learning_rate": 7.539108576140264e-09, "loss": 0.65729356, "num_input_tokens_seen": 174741550, "step": 8093, "time_per_iteration": 3.8809845447540283 }, { "auxiliary_loss_clip": 0.01105716, "auxiliary_loss_mlp": 0.01026162, "balance_loss_clip": 1.04143751, "balance_loss_mlp": 1.01942074, "epoch": 0.9732459568328022, "flos": 18478841633280.0, "grad_norm": 2.5597798674010104, "language_loss": 0.70629871, "learning_rate": 7.471686893661732e-09, "loss": 0.72761744, "num_input_tokens_seen": 174759845, "step": 8094, "time_per_iteration": 2.575995445251465 }, { "auxiliary_loss_clip": 0.01134082, "auxiliary_loss_mlp": 0.01025947, "balance_loss_clip": 1.0468061, "balance_loss_mlp": 1.01899385, "epoch": 0.9733661997234414, "flos": 20883886623360.0, "grad_norm": 1.8070783566164774, "language_loss": 0.64235735, "learning_rate": 7.4045674761442636e-09, "loss": 0.66395772, "num_input_tokens_seen": 174777175, "step": 8095, "time_per_iteration": 2.5133304595947266 }, { "auxiliary_loss_clip": 0.01161325, "auxiliary_loss_mlp": 0.00760721, "balance_loss_clip": 1.04728925, "balance_loss_mlp": 1.00031745, "epoch": 0.9734864426140805, "flos": 23766795175680.0, "grad_norm": 1.6667388396975555, "language_loss": 0.74245483, "learning_rate": 7.337750333769488e-09, "loss": 0.76167524, "num_input_tokens_seen": 174796980, "step": 8096, "time_per_iteration": 2.4868946075439453 }, { "auxiliary_loss_clip": 0.01139406, "auxiliary_loss_mlp": 0.0102772, "balance_loss_clip": 1.04232049, "balance_loss_mlp": 1.0198853, "epoch": 0.9736066855047195, "flos": 35042422176000.0, "grad_norm": 1.742743131935378, "language_loss": 0.72891945, "learning_rate": 7.2712354766737425e-09, "loss": 0.75059074, "num_input_tokens_seen": 174817310, "step": 8097, "time_per_iteration": 2.6347479820251465 }, { "auxiliary_loss_clip": 0.01112232, "auxiliary_loss_mlp": 0.01026971, "balance_loss_clip": 1.04446495, "balance_loss_mlp": 1.01991701, "epoch": 0.9737269283953586, "flos": 20410620001920.0, "grad_norm": 1.5450109162765466, "language_loss": 0.80678189, "learning_rate": 7.2050229149469565e-09, "loss": 0.82817394, "num_input_tokens_seen": 174837320, "step": 8098, "time_per_iteration": 2.5532073974609375 }, { "auxiliary_loss_clip": 0.01123507, "auxiliary_loss_mlp": 0.01022838, "balance_loss_clip": 1.04042423, "balance_loss_mlp": 1.01614165, "epoch": 0.9738471712859977, "flos": 28911680847360.0, "grad_norm": 1.7137882644764117, "language_loss": 0.63454342, "learning_rate": 7.139112658633984e-09, "loss": 0.65600687, "num_input_tokens_seen": 174857470, "step": 8099, "time_per_iteration": 2.6178789138793945 }, { "auxiliary_loss_clip": 0.01121181, "auxiliary_loss_mlp": 0.0102124, "balance_loss_clip": 1.04534793, "balance_loss_mlp": 1.01407301, "epoch": 0.9739674141766368, "flos": 27782326356480.0, "grad_norm": 2.0393717650867744, "language_loss": 0.70405692, "learning_rate": 7.073504717733048e-09, "loss": 0.72548115, "num_input_tokens_seen": 174877035, "step": 8100, "time_per_iteration": 2.6085379123687744 }, { "auxiliary_loss_clip": 0.01008449, "auxiliary_loss_mlp": 0.01002919, "balance_loss_clip": 1.00922012, "balance_loss_mlp": 1.00194192, "epoch": 0.9740876570672758, "flos": 68863057188480.0, "grad_norm": 0.7318306031136627, "language_loss": 0.57223362, "learning_rate": 7.008199102196855e-09, "loss": 0.59234726, "num_input_tokens_seen": 174938460, "step": 8101, "time_per_iteration": 3.120814085006714 }, { "auxiliary_loss_clip": 0.01031341, "auxiliary_loss_mlp": 0.01001373, "balance_loss_clip": 1.01064944, "balance_loss_mlp": 1.00043094, "epoch": 0.974207899957915, "flos": 58236622646400.0, "grad_norm": 0.7937686289077596, "language_loss": 0.58983928, "learning_rate": 6.9431958219321464e-09, "loss": 0.61016637, "num_input_tokens_seen": 174994625, "step": 8102, "time_per_iteration": 3.091003179550171 }, { "auxiliary_loss_clip": 0.01134289, "auxiliary_loss_mlp": 0.01024649, "balance_loss_clip": 1.04358077, "balance_loss_mlp": 1.01740408, "epoch": 0.9743281428485541, "flos": 22600057605120.0, "grad_norm": 1.4682443897966269, "language_loss": 0.77720892, "learning_rate": 6.878494886800146e-09, "loss": 0.79879832, "num_input_tokens_seen": 175015400, "step": 8103, "time_per_iteration": 2.533651828765869 }, { "auxiliary_loss_clip": 0.01134014, "auxiliary_loss_mlp": 0.01025236, "balance_loss_clip": 1.04719496, "balance_loss_mlp": 1.01838171, "epoch": 0.9744483857391931, "flos": 20008815488640.0, "grad_norm": 1.8921676450977447, "language_loss": 0.76410925, "learning_rate": 6.814096306615669e-09, "loss": 0.78570175, "num_input_tokens_seen": 175033540, "step": 8104, "time_per_iteration": 3.308861017227173 }, { "auxiliary_loss_clip": 0.01139884, "auxiliary_loss_mlp": 0.01026371, "balance_loss_clip": 1.04345751, "balance_loss_mlp": 1.01904917, "epoch": 0.9745686286298323, "flos": 17675268520320.0, "grad_norm": 2.349906533584389, "language_loss": 0.64958918, "learning_rate": 6.750000091148011e-09, "loss": 0.67125171, "num_input_tokens_seen": 175050835, "step": 8105, "time_per_iteration": 2.494753837585449 }, { "auxiliary_loss_clip": 0.01164801, "auxiliary_loss_mlp": 0.01026875, "balance_loss_clip": 1.04883742, "balance_loss_mlp": 1.02001119, "epoch": 0.9746888715204713, "flos": 29460252332160.0, "grad_norm": 1.84913374105514, "language_loss": 0.72452414, "learning_rate": 6.686206250120729e-09, "loss": 0.74644095, "num_input_tokens_seen": 175072330, "step": 8106, "time_per_iteration": 2.5196568965911865 }, { "auxiliary_loss_clip": 0.01125218, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 1.04104519, "balance_loss_mlp": 1.01998186, "epoch": 0.9748091144111104, "flos": 18479308510080.0, "grad_norm": 1.851565209008299, "language_loss": 0.74677575, "learning_rate": 6.622714793210749e-09, "loss": 0.76829839, "num_input_tokens_seen": 175091250, "step": 8107, "time_per_iteration": 2.54463529586792 }, { "auxiliary_loss_clip": 0.01163905, "auxiliary_loss_mlp": 0.01022901, "balance_loss_clip": 1.04737616, "balance_loss_mlp": 1.01628184, "epoch": 0.9749293573017496, "flos": 20665154753280.0, "grad_norm": 1.7923424113228867, "language_loss": 0.78644866, "learning_rate": 6.559525730050364e-09, "loss": 0.80831671, "num_input_tokens_seen": 175111350, "step": 8108, "time_per_iteration": 2.4503021240234375 }, { "auxiliary_loss_clip": 0.0112522, "auxiliary_loss_mlp": 0.01027256, "balance_loss_clip": 1.04553723, "balance_loss_mlp": 1.02072012, "epoch": 0.9750496001923886, "flos": 18478590238080.0, "grad_norm": 3.4666589370651226, "language_loss": 0.75882512, "learning_rate": 6.496639070224574e-09, "loss": 0.78034985, "num_input_tokens_seen": 175129835, "step": 8109, "time_per_iteration": 2.5428624153137207 }, { "auxiliary_loss_clip": 0.01153931, "auxiliary_loss_mlp": 0.01026795, "balance_loss_clip": 1.0479846, "balance_loss_mlp": 1.02005374, "epoch": 0.9751698430830277, "flos": 19572967860480.0, "grad_norm": 2.1001098801539695, "language_loss": 0.83430654, "learning_rate": 6.4340548232739714e-09, "loss": 0.85611379, "num_input_tokens_seen": 175146035, "step": 8110, "time_per_iteration": 2.458155632019043 }, { "auxiliary_loss_clip": 0.01126026, "auxiliary_loss_mlp": 0.01025705, "balance_loss_clip": 1.04226553, "balance_loss_mlp": 1.01921654, "epoch": 0.9752900859736668, "flos": 23550325862400.0, "grad_norm": 2.0629284809186066, "language_loss": 0.79172021, "learning_rate": 6.371772998692071e-09, "loss": 0.81323755, "num_input_tokens_seen": 175165290, "step": 8111, "time_per_iteration": 2.6033778190612793 }, { "auxiliary_loss_clip": 0.01124401, "auxiliary_loss_mlp": 0.0102811, "balance_loss_clip": 1.04242039, "balance_loss_mlp": 1.02113879, "epoch": 0.9754103288643059, "flos": 20303211358080.0, "grad_norm": 3.313251058324904, "language_loss": 0.64667892, "learning_rate": 6.309793605927094e-09, "loss": 0.66820401, "num_input_tokens_seen": 175183610, "step": 8112, "time_per_iteration": 2.6252126693725586 }, { "auxiliary_loss_clip": 0.01137657, "auxiliary_loss_mlp": 0.01019222, "balance_loss_clip": 1.04499686, "balance_loss_mlp": 1.01214671, "epoch": 0.975530571754945, "flos": 19350680544000.0, "grad_norm": 1.7670501539824324, "language_loss": 0.80257463, "learning_rate": 6.248116654381297e-09, "loss": 0.82414341, "num_input_tokens_seen": 175202080, "step": 8113, "time_per_iteration": 2.517439842224121 }, { "auxiliary_loss_clip": 0.01135949, "auxiliary_loss_mlp": 0.01019507, "balance_loss_clip": 1.04132771, "balance_loss_mlp": 1.01336145, "epoch": 0.9756508146455841, "flos": 23583399310080.0, "grad_norm": 1.619109069468923, "language_loss": 0.72603869, "learning_rate": 6.186742153410751e-09, "loss": 0.74759322, "num_input_tokens_seen": 175221575, "step": 8114, "time_per_iteration": 2.550299644470215 }, { "auxiliary_loss_clip": 0.01135344, "auxiliary_loss_mlp": 0.01032051, "balance_loss_clip": 1.04614377, "balance_loss_mlp": 1.0249964, "epoch": 0.9757710575362232, "flos": 22966921163520.0, "grad_norm": 2.0678730766693656, "language_loss": 0.87404859, "learning_rate": 6.125670112326453e-09, "loss": 0.89572257, "num_input_tokens_seen": 175240835, "step": 8115, "time_per_iteration": 2.547908306121826 }, { "auxiliary_loss_clip": 0.01147005, "auxiliary_loss_mlp": 0.01027857, "balance_loss_clip": 1.04207921, "balance_loss_mlp": 1.02074361, "epoch": 0.9758913004268622, "flos": 27966009530880.0, "grad_norm": 1.4563347907475137, "language_loss": 0.69758636, "learning_rate": 6.064900540392548e-09, "loss": 0.71933496, "num_input_tokens_seen": 175262930, "step": 8116, "time_per_iteration": 3.3527634143829346 }, { "auxiliary_loss_clip": 0.01130711, "auxiliary_loss_mlp": 0.01020712, "balance_loss_clip": 1.04597473, "balance_loss_mlp": 1.01478422, "epoch": 0.9760115433175014, "flos": 22200156512640.0, "grad_norm": 3.675106257629933, "language_loss": 0.78747511, "learning_rate": 6.0044334468278835e-09, "loss": 0.80898935, "num_input_tokens_seen": 175282275, "step": 8117, "time_per_iteration": 3.3055875301361084 }, { "auxiliary_loss_clip": 0.01105521, "auxiliary_loss_mlp": 0.01029235, "balance_loss_clip": 1.03974879, "balance_loss_mlp": 1.02204931, "epoch": 0.9761317862081405, "flos": 26250736389120.0, "grad_norm": 2.1004759457793636, "language_loss": 0.71790421, "learning_rate": 5.944268840805345e-09, "loss": 0.73925173, "num_input_tokens_seen": 175303020, "step": 8118, "time_per_iteration": 3.3869924545288086 }, { "auxiliary_loss_clip": 0.01115597, "auxiliary_loss_mlp": 0.01025611, "balance_loss_clip": 1.04155397, "balance_loss_mlp": 1.01903677, "epoch": 0.9762520290987795, "flos": 26575440359040.0, "grad_norm": 2.074114051420245, "language_loss": 0.64118737, "learning_rate": 5.88440673145163e-09, "loss": 0.66259944, "num_input_tokens_seen": 175324070, "step": 8119, "time_per_iteration": 2.5887725353240967 }, { "auxiliary_loss_clip": 0.01148902, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.04847777, "balance_loss_mlp": 1.02311385, "epoch": 0.9763722719894187, "flos": 18005036307840.0, "grad_norm": 2.4995641137639524, "language_loss": 0.8222388, "learning_rate": 5.824847127848142e-09, "loss": 0.84402746, "num_input_tokens_seen": 175342595, "step": 8120, "time_per_iteration": 2.4793779850006104 }, { "auxiliary_loss_clip": 0.01109721, "auxiliary_loss_mlp": 0.0102602, "balance_loss_clip": 1.04192913, "balance_loss_mlp": 1.01911211, "epoch": 0.9764925148800577, "flos": 22455660931200.0, "grad_norm": 1.8912771846638132, "language_loss": 0.79150844, "learning_rate": 5.765590039029433e-09, "loss": 0.81286585, "num_input_tokens_seen": 175361915, "step": 8121, "time_per_iteration": 2.6196846961975098 }, { "auxiliary_loss_clip": 0.01163517, "auxiliary_loss_mlp": 0.01027366, "balance_loss_clip": 1.04880714, "balance_loss_mlp": 1.02056551, "epoch": 0.9766127577706968, "flos": 36757084786560.0, "grad_norm": 1.830093689581446, "language_loss": 0.70972621, "learning_rate": 5.706635473985422e-09, "loss": 0.73163497, "num_input_tokens_seen": 175385785, "step": 8122, "time_per_iteration": 2.598020076751709 }, { "auxiliary_loss_clip": 0.01149677, "auxiliary_loss_mlp": 0.01027794, "balance_loss_clip": 1.04675031, "balance_loss_mlp": 1.02096367, "epoch": 0.976733000661336, "flos": 22309971367680.0, "grad_norm": 1.7708314579665103, "language_loss": 0.84732628, "learning_rate": 5.6479834416591764e-09, "loss": 0.86910093, "num_input_tokens_seen": 175405145, "step": 8123, "time_per_iteration": 2.5231235027313232 }, { "auxiliary_loss_clip": 0.01148366, "auxiliary_loss_mlp": 0.00761227, "balance_loss_clip": 1.04634452, "balance_loss_mlp": 1.00029039, "epoch": 0.976853243551975, "flos": 25810938264960.0, "grad_norm": 1.7649009470239023, "language_loss": 0.68388832, "learning_rate": 5.589633950947803e-09, "loss": 0.70298421, "num_input_tokens_seen": 175422645, "step": 8124, "time_per_iteration": 2.5520458221435547 }, { "auxiliary_loss_clip": 0.0113376, "auxiliary_loss_mlp": 0.01027976, "balance_loss_clip": 1.04494739, "balance_loss_mlp": 1.02035904, "epoch": 0.9769734864426141, "flos": 21397445326080.0, "grad_norm": 1.9622206196131053, "language_loss": 0.69693482, "learning_rate": 5.5315870107035535e-09, "loss": 0.71855217, "num_input_tokens_seen": 175440695, "step": 8125, "time_per_iteration": 2.5286264419555664 }, { "auxiliary_loss_clip": 0.01128625, "auxiliary_loss_mlp": 0.01024466, "balance_loss_clip": 1.04361653, "balance_loss_mlp": 1.01743853, "epoch": 0.9770937293332532, "flos": 13990977584640.0, "grad_norm": 1.8439173250890952, "language_loss": 0.78845292, "learning_rate": 5.473842629731607e-09, "loss": 0.80998385, "num_input_tokens_seen": 175459195, "step": 8126, "time_per_iteration": 2.52412486076355 }, { "auxiliary_loss_clip": 0.0114117, "auxiliary_loss_mlp": 0.00761316, "balance_loss_clip": 1.0429852, "balance_loss_mlp": 1.00033402, "epoch": 0.9772139722238923, "flos": 17931994001280.0, "grad_norm": 2.648472577256281, "language_loss": 0.778602, "learning_rate": 5.416400816792066e-09, "loss": 0.79762685, "num_input_tokens_seen": 175476710, "step": 8127, "time_per_iteration": 2.506303548812866 }, { "auxiliary_loss_clip": 0.01162325, "auxiliary_loss_mlp": 0.01026523, "balance_loss_clip": 1.04698205, "balance_loss_mlp": 1.01963866, "epoch": 0.9773342151145313, "flos": 20446171488000.0, "grad_norm": 2.3620208117501935, "language_loss": 0.78060627, "learning_rate": 5.359261580598407e-09, "loss": 0.80249476, "num_input_tokens_seen": 175492550, "step": 8128, "time_per_iteration": 2.464015007019043 }, { "auxiliary_loss_clip": 0.01149982, "auxiliary_loss_mlp": 0.01023565, "balance_loss_clip": 1.04575109, "balance_loss_mlp": 1.01599562, "epoch": 0.9774544580051704, "flos": 11837306949120.0, "grad_norm": 2.232291467392377, "language_loss": 0.78115386, "learning_rate": 5.302424929819027e-09, "loss": 0.80288935, "num_input_tokens_seen": 175506560, "step": 8129, "time_per_iteration": 2.4585084915161133 }, { "auxiliary_loss_clip": 0.01151011, "auxiliary_loss_mlp": 0.01023771, "balance_loss_clip": 1.0435369, "balance_loss_mlp": 1.01673174, "epoch": 0.9775747008958096, "flos": 13479932833920.0, "grad_norm": 2.4873863576204953, "language_loss": 0.73043025, "learning_rate": 5.24589087307592e-09, "loss": 0.75217807, "num_input_tokens_seen": 175524180, "step": 8130, "time_per_iteration": 2.4824674129486084 }, { "auxiliary_loss_clip": 0.0116469, "auxiliary_loss_mlp": 0.01025224, "balance_loss_clip": 1.047261, "balance_loss_mlp": 1.0185008, "epoch": 0.9776949437864486, "flos": 59532314042880.0, "grad_norm": 1.379042151878069, "language_loss": 0.64668667, "learning_rate": 5.189659418944891e-09, "loss": 0.66858578, "num_input_tokens_seen": 175554355, "step": 8131, "time_per_iteration": 3.559797525405884 }, { "auxiliary_loss_clip": 0.01163623, "auxiliary_loss_mlp": 0.010256, "balance_loss_clip": 1.04829586, "balance_loss_mlp": 1.01903141, "epoch": 0.9778151866770877, "flos": 21178605715200.0, "grad_norm": 3.337282486063652, "language_loss": 0.78229892, "learning_rate": 5.133730575956674e-09, "loss": 0.80419111, "num_input_tokens_seen": 175574025, "step": 8132, "time_per_iteration": 2.456928253173828 }, { "auxiliary_loss_clip": 0.01135117, "auxiliary_loss_mlp": 0.01033027, "balance_loss_clip": 1.04385424, "balance_loss_mlp": 1.0259161, "epoch": 0.9779354295677268, "flos": 20886795624960.0, "grad_norm": 1.8958627347827732, "language_loss": 0.71999693, "learning_rate": 5.0781043525953696e-09, "loss": 0.74167836, "num_input_tokens_seen": 175592090, "step": 8133, "time_per_iteration": 2.522958993911743 }, { "auxiliary_loss_clip": 0.01133132, "auxiliary_loss_mlp": 0.01026661, "balance_loss_clip": 1.04692996, "balance_loss_mlp": 1.01973498, "epoch": 0.9780556724583659, "flos": 23440618748160.0, "grad_norm": 1.5636947504241716, "language_loss": 0.73686719, "learning_rate": 5.0227807572995605e-09, "loss": 0.75846505, "num_input_tokens_seen": 175614065, "step": 8134, "time_per_iteration": 2.546506643295288 }, { "auxiliary_loss_clip": 0.01138432, "auxiliary_loss_mlp": 0.01024041, "balance_loss_clip": 1.04434395, "balance_loss_mlp": 1.0173744, "epoch": 0.9781759153490049, "flos": 20923244951040.0, "grad_norm": 2.0613216496413864, "language_loss": 0.67396057, "learning_rate": 4.967759798461646e-09, "loss": 0.69558525, "num_input_tokens_seen": 175632410, "step": 8135, "time_per_iteration": 2.5524251461029053 }, { "auxiliary_loss_clip": 0.01162052, "auxiliary_loss_mlp": 0.01026303, "balance_loss_clip": 1.04799819, "balance_loss_mlp": 1.01998174, "epoch": 0.9782961582396441, "flos": 28293191539200.0, "grad_norm": 1.9920821027096027, "language_loss": 0.74743414, "learning_rate": 4.913041484428282e-09, "loss": 0.76931763, "num_input_tokens_seen": 175652885, "step": 8136, "time_per_iteration": 2.5039172172546387 }, { "auxiliary_loss_clip": 0.01150666, "auxiliary_loss_mlp": 0.01026306, "balance_loss_clip": 1.04572475, "balance_loss_mlp": 1.01974607, "epoch": 0.9784164011302832, "flos": 25552955808000.0, "grad_norm": 1.7335580604052399, "language_loss": 0.74004799, "learning_rate": 4.858625823500384e-09, "loss": 0.76181769, "num_input_tokens_seen": 175670585, "step": 8137, "time_per_iteration": 2.530212879180908 }, { "auxiliary_loss_clip": 0.01152927, "auxiliary_loss_mlp": 0.01026264, "balance_loss_clip": 1.04605591, "balance_loss_mlp": 1.0191983, "epoch": 0.9785366440209222, "flos": 29965945956480.0, "grad_norm": 1.783457638464804, "language_loss": 0.73589694, "learning_rate": 4.80451282393246e-09, "loss": 0.75768888, "num_input_tokens_seen": 175690570, "step": 8138, "time_per_iteration": 2.565626621246338 }, { "auxiliary_loss_clip": 0.01132408, "auxiliary_loss_mlp": 0.01026721, "balance_loss_clip": 1.04477429, "balance_loss_mlp": 1.02008152, "epoch": 0.9786568869115614, "flos": 32343591847680.0, "grad_norm": 1.9356718164200746, "language_loss": 0.67370594, "learning_rate": 4.750702493933722e-09, "loss": 0.69529724, "num_input_tokens_seen": 175710455, "step": 8139, "time_per_iteration": 2.6023685932159424 }, { "auxiliary_loss_clip": 0.01131475, "auxiliary_loss_mlp": 0.00760663, "balance_loss_clip": 1.04379165, "balance_loss_mlp": 1.00031805, "epoch": 0.9787771298022004, "flos": 23331414424320.0, "grad_norm": 1.8531392822683552, "language_loss": 0.85275543, "learning_rate": 4.697194841666974e-09, "loss": 0.87167686, "num_input_tokens_seen": 175729380, "step": 8140, "time_per_iteration": 2.5328710079193115 }, { "auxiliary_loss_clip": 0.01148913, "auxiliary_loss_mlp": 0.01025211, "balance_loss_clip": 1.04408312, "balance_loss_mlp": 1.0179038, "epoch": 0.9788973726928395, "flos": 21468548298240.0, "grad_norm": 2.0229016328631744, "language_loss": 0.81767964, "learning_rate": 4.6439898752492764e-09, "loss": 0.83942091, "num_input_tokens_seen": 175749520, "step": 8141, "time_per_iteration": 2.4916458129882812 }, { "auxiliary_loss_clip": 0.01045897, "auxiliary_loss_mlp": 0.00751165, "balance_loss_clip": 1.00840497, "balance_loss_mlp": 1.00023317, "epoch": 0.9790176155834787, "flos": 68897459439360.0, "grad_norm": 0.7512680982481615, "language_loss": 0.63699257, "learning_rate": 4.591087602751731e-09, "loss": 0.65496325, "num_input_tokens_seen": 175811380, "step": 8142, "time_per_iteration": 3.9472527503967285 }, { "auxiliary_loss_clip": 0.01147399, "auxiliary_loss_mlp": 0.01022887, "balance_loss_clip": 1.04554772, "balance_loss_mlp": 1.01613081, "epoch": 0.9791378584741177, "flos": 21430877909760.0, "grad_norm": 1.611340909500763, "language_loss": 0.71945065, "learning_rate": 4.538488032199916e-09, "loss": 0.74115348, "num_input_tokens_seen": 175829480, "step": 8143, "time_per_iteration": 3.290969133377075 }, { "auxiliary_loss_clip": 0.01150175, "auxiliary_loss_mlp": 0.01022784, "balance_loss_clip": 1.04336429, "balance_loss_mlp": 1.01554501, "epoch": 0.9792581013647568, "flos": 20153032594560.0, "grad_norm": 1.9266721278007346, "language_loss": 0.68979269, "learning_rate": 4.486191171572784e-09, "loss": 0.71152228, "num_input_tokens_seen": 175846750, "step": 8144, "time_per_iteration": 3.239699125289917 }, { "auxiliary_loss_clip": 0.01150144, "auxiliary_loss_mlp": 0.01025408, "balance_loss_clip": 1.0460577, "balance_loss_mlp": 1.01871991, "epoch": 0.9793783442553959, "flos": 23728191033600.0, "grad_norm": 1.827865833262024, "language_loss": 0.77738929, "learning_rate": 4.434197028803766e-09, "loss": 0.79914486, "num_input_tokens_seen": 175865975, "step": 8145, "time_per_iteration": 2.5160465240478516 }, { "auxiliary_loss_clip": 0.01124774, "auxiliary_loss_mlp": 0.01028424, "balance_loss_clip": 1.0405333, "balance_loss_mlp": 1.02154589, "epoch": 0.979498587146035, "flos": 23038742407680.0, "grad_norm": 1.9806929308350252, "language_loss": 0.82265782, "learning_rate": 4.3825056117805514e-09, "loss": 0.84418976, "num_input_tokens_seen": 175881860, "step": 8146, "time_per_iteration": 2.5685462951660156 }, { "auxiliary_loss_clip": 0.01163813, "auxiliary_loss_mlp": 0.0102437, "balance_loss_clip": 1.04701364, "balance_loss_mlp": 1.01741147, "epoch": 0.979618830036674, "flos": 14318841951360.0, "grad_norm": 2.0551798304332283, "language_loss": 0.79296279, "learning_rate": 4.331116928344425e-09, "loss": 0.81484455, "num_input_tokens_seen": 175898175, "step": 8147, "time_per_iteration": 2.4252238273620605 }, { "auxiliary_loss_clip": 0.0114019, "auxiliary_loss_mlp": 0.0076096, "balance_loss_clip": 1.04375589, "balance_loss_mlp": 1.00028753, "epoch": 0.9797390729273132, "flos": 16727514215040.0, "grad_norm": 2.0273246086046766, "language_loss": 0.62441576, "learning_rate": 4.28003098629115e-09, "loss": 0.64342725, "num_input_tokens_seen": 175914310, "step": 8148, "time_per_iteration": 2.5265822410583496 }, { "auxiliary_loss_clip": 0.01115449, "auxiliary_loss_mlp": 0.01023977, "balance_loss_clip": 1.03632116, "balance_loss_mlp": 1.01725399, "epoch": 0.9798593158179523, "flos": 24532661986560.0, "grad_norm": 1.6846564874758947, "language_loss": 0.78381091, "learning_rate": 4.229247793370305e-09, "loss": 0.80520517, "num_input_tokens_seen": 175933435, "step": 8149, "time_per_iteration": 2.5859546661376953 }, { "auxiliary_loss_clip": 0.01164656, "auxiliary_loss_mlp": 0.01028653, "balance_loss_clip": 1.04818368, "balance_loss_mlp": 1.02172685, "epoch": 0.9799795587085913, "flos": 27308808339840.0, "grad_norm": 1.5806613636903595, "language_loss": 0.70384502, "learning_rate": 4.178767357285951e-09, "loss": 0.7257781, "num_input_tokens_seen": 175955065, "step": 8150, "time_per_iteration": 2.5207607746124268 }, { "auxiliary_loss_clip": 0.01150257, "auxiliary_loss_mlp": 0.00760564, "balance_loss_clip": 1.0465585, "balance_loss_mlp": 1.00031447, "epoch": 0.9800998015992305, "flos": 26286575184000.0, "grad_norm": 2.4914055156214086, "language_loss": 0.7162239, "learning_rate": 4.128589685695516e-09, "loss": 0.73533213, "num_input_tokens_seen": 175975490, "step": 8151, "time_per_iteration": 2.547163724899292 }, { "auxiliary_loss_clip": 0.01162251, "auxiliary_loss_mlp": 0.01033398, "balance_loss_clip": 1.04668212, "balance_loss_mlp": 1.0265491, "epoch": 0.9802200444898695, "flos": 16723635546240.0, "grad_norm": 1.9612872546750713, "language_loss": 0.84468496, "learning_rate": 4.078714786211135e-09, "loss": 0.86664146, "num_input_tokens_seen": 175991340, "step": 8152, "time_per_iteration": 2.425194501876831 }, { "auxiliary_loss_clip": 0.01147926, "auxiliary_loss_mlp": 0.01025418, "balance_loss_clip": 1.04599524, "balance_loss_mlp": 1.01916528, "epoch": 0.9803402873805086, "flos": 24900459298560.0, "grad_norm": 1.6359488173187708, "language_loss": 0.76825488, "learning_rate": 4.029142666398977e-09, "loss": 0.78998828, "num_input_tokens_seen": 176011505, "step": 8153, "time_per_iteration": 2.5276401042938232 }, { "auxiliary_loss_clip": 0.01160425, "auxiliary_loss_mlp": 0.01029032, "balance_loss_clip": 1.04720986, "balance_loss_mlp": 1.02241588, "epoch": 0.9804605302711478, "flos": 22564937082240.0, "grad_norm": 1.807123162345723, "language_loss": 0.79807222, "learning_rate": 3.979873333778805e-09, "loss": 0.81996679, "num_input_tokens_seen": 176029680, "step": 8154, "time_per_iteration": 2.4637038707733154 }, { "auxiliary_loss_clip": 0.01140608, "auxiliary_loss_mlp": 0.01024811, "balance_loss_clip": 1.0459919, "balance_loss_mlp": 1.01807821, "epoch": 0.9805807731617868, "flos": 38905368382080.0, "grad_norm": 1.944079679902687, "language_loss": 0.73945308, "learning_rate": 3.930906795824862e-09, "loss": 0.76110721, "num_input_tokens_seen": 176050355, "step": 8155, "time_per_iteration": 2.6672439575195312 }, { "auxiliary_loss_clip": 0.01145554, "auxiliary_loss_mlp": 0.01024052, "balance_loss_clip": 1.04515743, "balance_loss_mlp": 1.01749516, "epoch": 0.9807010160524259, "flos": 17821999578240.0, "grad_norm": 2.243285063647544, "language_loss": 0.76883078, "learning_rate": 3.882243059965207e-09, "loss": 0.79052675, "num_input_tokens_seen": 176068070, "step": 8156, "time_per_iteration": 3.195885181427002 }, { "auxiliary_loss_clip": 0.01141307, "auxiliary_loss_mlp": 0.0102517, "balance_loss_clip": 1.04347301, "balance_loss_mlp": 1.01805639, "epoch": 0.980821258943065, "flos": 13552975140480.0, "grad_norm": 3.082380597786366, "language_loss": 0.65473318, "learning_rate": 3.833882133582156e-09, "loss": 0.67639798, "num_input_tokens_seen": 176083730, "step": 8157, "time_per_iteration": 2.4624996185302734 }, { "auxiliary_loss_clip": 0.01150785, "auxiliary_loss_mlp": 0.01026981, "balance_loss_clip": 1.04545939, "balance_loss_mlp": 1.02066338, "epoch": 0.9809415018337041, "flos": 21689794120320.0, "grad_norm": 1.5425903302566295, "language_loss": 0.77942395, "learning_rate": 3.785824024012285e-09, "loss": 0.80120158, "num_input_tokens_seen": 176102730, "step": 8158, "time_per_iteration": 2.4978280067443848 }, { "auxiliary_loss_clip": 0.0112631, "auxiliary_loss_mlp": 0.01030107, "balance_loss_clip": 1.04366827, "balance_loss_mlp": 1.02387261, "epoch": 0.9810617447243432, "flos": 23294857357440.0, "grad_norm": 1.523929864464651, "language_loss": 0.78236234, "learning_rate": 3.738068738545541e-09, "loss": 0.80392647, "num_input_tokens_seen": 176121815, "step": 8159, "time_per_iteration": 2.530754566192627 }, { "auxiliary_loss_clip": 0.01152703, "auxiliary_loss_mlp": 0.01030112, "balance_loss_clip": 1.04649401, "balance_loss_mlp": 1.02295971, "epoch": 0.9811819876149822, "flos": 18332038748160.0, "grad_norm": 2.1719848361027903, "language_loss": 0.78346843, "learning_rate": 3.6906162844265733e-09, "loss": 0.8052966, "num_input_tokens_seen": 176138900, "step": 8160, "time_per_iteration": 2.4768030643463135 }, { "auxiliary_loss_clip": 0.0113101, "auxiliary_loss_mlp": 0.01031797, "balance_loss_clip": 1.04454792, "balance_loss_mlp": 1.02485943, "epoch": 0.9813022305056214, "flos": 22601961025920.0, "grad_norm": 1.936235984993442, "language_loss": 0.70592302, "learning_rate": 3.643466668853845e-09, "loss": 0.7275511, "num_input_tokens_seen": 176156925, "step": 8161, "time_per_iteration": 2.540848970413208 }, { "auxiliary_loss_clip": 0.01137902, "auxiliary_loss_mlp": 0.01026633, "balance_loss_clip": 1.04392147, "balance_loss_mlp": 1.02026391, "epoch": 0.9814224733962604, "flos": 25413335642880.0, "grad_norm": 1.90135152675509, "language_loss": 0.75154173, "learning_rate": 3.59661989898008e-09, "loss": 0.7731871, "num_input_tokens_seen": 176177980, "step": 8162, "time_per_iteration": 2.59548282623291 }, { "auxiliary_loss_clip": 0.01116425, "auxiliary_loss_mlp": 0.01024571, "balance_loss_clip": 1.0449959, "balance_loss_mlp": 1.0179193, "epoch": 0.9815427162868995, "flos": 25007185584000.0, "grad_norm": 1.6147359137053718, "language_loss": 0.76838565, "learning_rate": 3.5500759819115934e-09, "loss": 0.78979558, "num_input_tokens_seen": 176198345, "step": 8163, "time_per_iteration": 2.5875895023345947 }, { "auxiliary_loss_clip": 0.01166815, "auxiliary_loss_mlp": 0.01030114, "balance_loss_clip": 1.0508287, "balance_loss_mlp": 1.02325988, "epoch": 0.9816629591775387, "flos": 20662604887680.0, "grad_norm": 2.1436356060995982, "language_loss": 0.80874419, "learning_rate": 3.5038349247094034e-09, "loss": 0.83071351, "num_input_tokens_seen": 176215605, "step": 8164, "time_per_iteration": 2.45546293258667 }, { "auxiliary_loss_clip": 0.01135142, "auxiliary_loss_mlp": 0.01024724, "balance_loss_clip": 1.0424993, "balance_loss_mlp": 1.01777995, "epoch": 0.9817832020681777, "flos": 17712220636800.0, "grad_norm": 2.0860430882892826, "language_loss": 0.77082467, "learning_rate": 3.4578967343878994e-09, "loss": 0.79242337, "num_input_tokens_seen": 176231810, "step": 8165, "time_per_iteration": 2.5021111965179443 }, { "auxiliary_loss_clip": 0.01134597, "auxiliary_loss_mlp": 0.01023959, "balance_loss_clip": 1.04554451, "balance_loss_mlp": 1.01714611, "epoch": 0.9819034449588168, "flos": 22530032040960.0, "grad_norm": 1.7193013271618676, "language_loss": 0.80814213, "learning_rate": 3.4122614179161733e-09, "loss": 0.82972765, "num_input_tokens_seen": 176251770, "step": 8166, "time_per_iteration": 2.5480239391326904 }, { "auxiliary_loss_clip": 0.01107681, "auxiliary_loss_mlp": 0.01027632, "balance_loss_clip": 1.03865242, "balance_loss_mlp": 1.02103114, "epoch": 0.9820236878494559, "flos": 20011221699840.0, "grad_norm": 1.6451849046050369, "language_loss": 0.77908462, "learning_rate": 3.36692898221691e-09, "loss": 0.80043781, "num_input_tokens_seen": 176270135, "step": 8167, "time_per_iteration": 2.5637834072113037 }, { "auxiliary_loss_clip": 0.01147385, "auxiliary_loss_mlp": 0.01025606, "balance_loss_clip": 1.0446595, "balance_loss_mlp": 1.01954091, "epoch": 0.982143930740095, "flos": 18807316531200.0, "grad_norm": 1.6167771871881964, "language_loss": 0.73665965, "learning_rate": 3.3218994341668305e-09, "loss": 0.75838953, "num_input_tokens_seen": 176289065, "step": 8168, "time_per_iteration": 2.4940452575683594 }, { "auxiliary_loss_clip": 0.01163367, "auxiliary_loss_mlp": 0.01029162, "balance_loss_clip": 1.04997993, "balance_loss_mlp": 1.0226531, "epoch": 0.982264173630734, "flos": 26578026138240.0, "grad_norm": 1.5734498918607238, "language_loss": 0.75553048, "learning_rate": 3.2771727805971373e-09, "loss": 0.77745575, "num_input_tokens_seen": 176310450, "step": 8169, "time_per_iteration": 3.3301572799682617 }, { "auxiliary_loss_clip": 0.0110206, "auxiliary_loss_mlp": 0.01027451, "balance_loss_clip": 1.03947198, "balance_loss_mlp": 1.02011633, "epoch": 0.9823844165213732, "flos": 22014462176640.0, "grad_norm": 1.77313643247294, "language_loss": 0.77467692, "learning_rate": 3.232749028292847e-09, "loss": 0.79597199, "num_input_tokens_seen": 176327415, "step": 8170, "time_per_iteration": 3.370128631591797 }, { "auxiliary_loss_clip": 0.01164443, "auxiliary_loss_mlp": 0.0102961, "balance_loss_clip": 1.04705179, "balance_loss_mlp": 1.02237439, "epoch": 0.9825046594120123, "flos": 21908166854400.0, "grad_norm": 1.6360489136248668, "language_loss": 0.88105559, "learning_rate": 3.188628183992792e-09, "loss": 0.90299618, "num_input_tokens_seen": 176347680, "step": 8171, "time_per_iteration": 2.472970724105835 }, { "auxiliary_loss_clip": 0.01045749, "auxiliary_loss_mlp": 0.01001841, "balance_loss_clip": 1.00737262, "balance_loss_mlp": 1.00078571, "epoch": 0.9826249023026513, "flos": 59494610718720.0, "grad_norm": 0.7403031592516569, "language_loss": 0.62527519, "learning_rate": 3.1448102543902844e-09, "loss": 0.64575112, "num_input_tokens_seen": 176411595, "step": 8172, "time_per_iteration": 3.0478413105010986 }, { "auxiliary_loss_clip": 0.01128375, "auxiliary_loss_mlp": 0.01027192, "balance_loss_clip": 1.04328918, "balance_loss_mlp": 1.0203644, "epoch": 0.9827451451932905, "flos": 16071031296000.0, "grad_norm": 1.9463393444583774, "language_loss": 0.67570108, "learning_rate": 3.1012952461324515e-09, "loss": 0.69725674, "num_input_tokens_seen": 176430570, "step": 8173, "time_per_iteration": 2.5091633796691895 }, { "auxiliary_loss_clip": 0.01149032, "auxiliary_loss_mlp": 0.010253, "balance_loss_clip": 1.0477891, "balance_loss_mlp": 1.01845717, "epoch": 0.9828653880839295, "flos": 20262775622400.0, "grad_norm": 2.14252282874822, "language_loss": 0.73904163, "learning_rate": 3.0580831658204575e-09, "loss": 0.76078492, "num_input_tokens_seen": 176448150, "step": 8174, "time_per_iteration": 2.4705028533935547 }, { "auxiliary_loss_clip": 0.01149618, "auxiliary_loss_mlp": 0.01026352, "balance_loss_clip": 1.0478164, "balance_loss_mlp": 1.01959848, "epoch": 0.9829856309745686, "flos": 21616141282560.0, "grad_norm": 1.4709225684114746, "language_loss": 0.77836394, "learning_rate": 3.015174020009281e-09, "loss": 0.80012363, "num_input_tokens_seen": 176467475, "step": 8175, "time_per_iteration": 2.5138156414031982 }, { "auxiliary_loss_clip": 0.01121332, "auxiliary_loss_mlp": 0.01022186, "balance_loss_clip": 1.03983045, "balance_loss_mlp": 1.01562333, "epoch": 0.9831058738652078, "flos": 23764209396480.0, "grad_norm": 2.0264487779550513, "language_loss": 0.75084436, "learning_rate": 2.9725678152086043e-09, "loss": 0.7722795, "num_input_tokens_seen": 176486045, "step": 8176, "time_per_iteration": 2.571632146835327 }, { "auxiliary_loss_clip": 0.01122336, "auxiliary_loss_mlp": 0.01025216, "balance_loss_clip": 1.04176581, "balance_loss_mlp": 1.0181644, "epoch": 0.9832261167558468, "flos": 11320911072000.0, "grad_norm": 3.54638297370488, "language_loss": 0.82084394, "learning_rate": 2.930264557881257e-09, "loss": 0.84231943, "num_input_tokens_seen": 176501230, "step": 8177, "time_per_iteration": 2.482571601867676 }, { "auxiliary_loss_clip": 0.01054537, "auxiliary_loss_mlp": 0.01002671, "balance_loss_clip": 1.00779736, "balance_loss_mlp": 1.00159192, "epoch": 0.9833463596464859, "flos": 60000304343040.0, "grad_norm": 0.837948524885748, "language_loss": 0.582165, "learning_rate": 2.8882642544452163e-09, "loss": 0.60273707, "num_input_tokens_seen": 176565955, "step": 8178, "time_per_iteration": 3.089775562286377 }, { "auxiliary_loss_clip": 0.01125621, "auxiliary_loss_mlp": 0.0102081, "balance_loss_clip": 1.04209661, "balance_loss_mlp": 1.01400924, "epoch": 0.983466602537125, "flos": 13626699805440.0, "grad_norm": 2.147168718196381, "language_loss": 0.7452659, "learning_rate": 2.8465669112716083e-09, "loss": 0.76673025, "num_input_tokens_seen": 176583480, "step": 8179, "time_per_iteration": 2.486344814300537 }, { "auxiliary_loss_clip": 0.01150106, "auxiliary_loss_mlp": 0.00761714, "balance_loss_clip": 1.04458237, "balance_loss_mlp": 1.00031996, "epoch": 0.9835868454277641, "flos": 22926844563840.0, "grad_norm": 1.9981548481843108, "language_loss": 0.76324677, "learning_rate": 2.8051725346858177e-09, "loss": 0.78236496, "num_input_tokens_seen": 176603740, "step": 8180, "time_per_iteration": 2.508835554122925 }, { "auxiliary_loss_clip": 0.0116353, "auxiliary_loss_mlp": 0.01025067, "balance_loss_clip": 1.04562807, "balance_loss_mlp": 1.01797724, "epoch": 0.9837070883184031, "flos": 27673409341440.0, "grad_norm": 1.9940841843442412, "language_loss": 0.71152163, "learning_rate": 2.7640811309674883e-09, "loss": 0.7334075, "num_input_tokens_seen": 176623240, "step": 8181, "time_per_iteration": 2.501286506652832 }, { "auxiliary_loss_clip": 0.01113113, "auxiliary_loss_mlp": 0.01024681, "balance_loss_clip": 1.04401636, "balance_loss_mlp": 1.01770735, "epoch": 0.9838273312090423, "flos": 29241951425280.0, "grad_norm": 1.6707796148732172, "language_loss": 0.80710185, "learning_rate": 2.7232927063498557e-09, "loss": 0.82847977, "num_input_tokens_seen": 176643615, "step": 8182, "time_per_iteration": 3.3174688816070557 }, { "auxiliary_loss_clip": 0.01148228, "auxiliary_loss_mlp": 0.01025948, "balance_loss_clip": 1.04555023, "balance_loss_mlp": 1.01896858, "epoch": 0.9839475740996814, "flos": 40110207304320.0, "grad_norm": 1.8327314822473735, "language_loss": 0.69184673, "learning_rate": 2.682807267020859e-09, "loss": 0.71358848, "num_input_tokens_seen": 176666375, "step": 8183, "time_per_iteration": 2.6475791931152344 }, { "auxiliary_loss_clip": 0.01147294, "auxiliary_loss_mlp": 0.01029271, "balance_loss_clip": 1.04522562, "balance_loss_mlp": 1.02233005, "epoch": 0.9840678169903204, "flos": 24169389788160.0, "grad_norm": 1.5488926685734743, "language_loss": 0.62547517, "learning_rate": 2.642624819121808e-09, "loss": 0.64724076, "num_input_tokens_seen": 176686525, "step": 8184, "time_per_iteration": 2.532592296600342 }, { "auxiliary_loss_clip": 0.0113582, "auxiliary_loss_mlp": 0.01028416, "balance_loss_clip": 1.04625916, "balance_loss_mlp": 1.02200222, "epoch": 0.9841880598809596, "flos": 14684484447360.0, "grad_norm": 2.001601571641615, "language_loss": 0.61615413, "learning_rate": 2.6027453687487154e-09, "loss": 0.63779652, "num_input_tokens_seen": 176703615, "step": 8185, "time_per_iteration": 2.491265058517456 }, { "auxiliary_loss_clip": 0.01135011, "auxiliary_loss_mlp": 0.01028785, "balance_loss_clip": 1.04439139, "balance_loss_mlp": 1.02129292, "epoch": 0.9843083027715986, "flos": 22344768668160.0, "grad_norm": 2.2284485073033244, "language_loss": 0.53784055, "learning_rate": 2.5631689219509643e-09, "loss": 0.55947852, "num_input_tokens_seen": 176722295, "step": 8186, "time_per_iteration": 2.541133165359497 }, { "auxiliary_loss_clip": 0.01133447, "auxiliary_loss_mlp": 0.01024994, "balance_loss_clip": 1.04689288, "balance_loss_mlp": 1.01868439, "epoch": 0.9844285456622377, "flos": 21800111765760.0, "grad_norm": 1.6577787944144287, "language_loss": 0.83786309, "learning_rate": 2.523895484732197e-09, "loss": 0.85944748, "num_input_tokens_seen": 176741750, "step": 8187, "time_per_iteration": 2.5377063751220703 }, { "auxiliary_loss_clip": 0.01153862, "auxiliary_loss_mlp": 0.01025956, "balance_loss_clip": 1.04559278, "balance_loss_mlp": 1.01889884, "epoch": 0.9845487885528769, "flos": 18035380321920.0, "grad_norm": 1.9360269030646078, "language_loss": 0.74586439, "learning_rate": 2.4849250630505357e-09, "loss": 0.76766253, "num_input_tokens_seen": 176759995, "step": 8188, "time_per_iteration": 2.4962258338928223 }, { "auxiliary_loss_clip": 0.01069828, "auxiliary_loss_mlp": 0.01024351, "balance_loss_clip": 1.03855348, "balance_loss_mlp": 1.01779437, "epoch": 0.9846690314435159, "flos": 25228610974080.0, "grad_norm": 1.7174683639460764, "language_loss": 0.72983849, "learning_rate": 2.4462576628172528e-09, "loss": 0.75078028, "num_input_tokens_seen": 176778625, "step": 8189, "time_per_iteration": 2.6770622730255127 }, { "auxiliary_loss_clip": 0.01148284, "auxiliary_loss_mlp": 0.01028986, "balance_loss_clip": 1.04738927, "balance_loss_mlp": 1.02219677, "epoch": 0.984789274334155, "flos": 18552171248640.0, "grad_norm": 1.7361001528416078, "language_loss": 0.73796427, "learning_rate": 2.407893289898766e-09, "loss": 0.75973701, "num_input_tokens_seen": 176797655, "step": 8190, "time_per_iteration": 2.4910712242126465 }, { "auxiliary_loss_clip": 0.01115925, "auxiliary_loss_mlp": 0.01023943, "balance_loss_clip": 1.04220855, "balance_loss_mlp": 1.01725245, "epoch": 0.984909517224794, "flos": 27345437233920.0, "grad_norm": 1.9460504451299345, "language_loss": 0.83923459, "learning_rate": 2.3698319501144202e-09, "loss": 0.86063337, "num_input_tokens_seen": 176818640, "step": 8191, "time_per_iteration": 2.5962395668029785 }, { "auxiliary_loss_clip": 0.0115095, "auxiliary_loss_mlp": 0.01024646, "balance_loss_clip": 1.04669356, "balance_loss_mlp": 1.01764596, "epoch": 0.9850297601154332, "flos": 18734058743040.0, "grad_norm": 1.554356296684526, "language_loss": 0.73114651, "learning_rate": 2.3320736492382644e-09, "loss": 0.75290251, "num_input_tokens_seen": 176837475, "step": 8192, "time_per_iteration": 2.470937967300415 }, { "auxiliary_loss_clip": 0.01159692, "auxiliary_loss_mlp": 0.01025763, "balance_loss_clip": 1.04689169, "balance_loss_mlp": 1.01916146, "epoch": 0.9851500030060723, "flos": 22308247514880.0, "grad_norm": 2.2062386970828256, "language_loss": 0.67761075, "learning_rate": 2.29461839299816e-09, "loss": 0.69946539, "num_input_tokens_seen": 176857190, "step": 8193, "time_per_iteration": 2.4660072326660156 }, { "auxiliary_loss_clip": 0.01120551, "auxiliary_loss_mlp": 0.01022984, "balance_loss_clip": 1.04248714, "balance_loss_mlp": 1.01652241, "epoch": 0.9852702458967113, "flos": 26353691746560.0, "grad_norm": 1.522430066956628, "language_loss": 0.79833531, "learning_rate": 2.257466187076229e-09, "loss": 0.81977069, "num_input_tokens_seen": 176876395, "step": 8194, "time_per_iteration": 2.5858240127563477 }, { "auxiliary_loss_clip": 0.01153673, "auxiliary_loss_mlp": 0.00760894, "balance_loss_clip": 1.04569829, "balance_loss_mlp": 1.00025594, "epoch": 0.9853904887873505, "flos": 20883599314560.0, "grad_norm": 1.7394319295092493, "language_loss": 0.71054125, "learning_rate": 2.2206170371081854e-09, "loss": 0.72968698, "num_input_tokens_seen": 176894980, "step": 8195, "time_per_iteration": 3.260167121887207 }, { "auxiliary_loss_clip": 0.01135969, "auxiliary_loss_mlp": 0.01025883, "balance_loss_clip": 1.0438931, "balance_loss_mlp": 1.0192225, "epoch": 0.9855107316779895, "flos": 25263444188160.0, "grad_norm": 1.6220214230925114, "language_loss": 0.84774864, "learning_rate": 2.1840709486842247e-09, "loss": 0.86936718, "num_input_tokens_seen": 176914600, "step": 8196, "time_per_iteration": 3.356483221054077 }, { "auxiliary_loss_clip": 0.0112816, "auxiliary_loss_mlp": 0.01034031, "balance_loss_clip": 1.04369676, "balance_loss_mlp": 1.02709615, "epoch": 0.9856309745686286, "flos": 19062102677760.0, "grad_norm": 2.0303278437867434, "language_loss": 0.79370087, "learning_rate": 2.1478279273481335e-09, "loss": 0.81532276, "num_input_tokens_seen": 176933085, "step": 8197, "time_per_iteration": 2.4991955757141113 }, { "auxiliary_loss_clip": 0.01148086, "auxiliary_loss_mlp": 0.01027056, "balance_loss_clip": 1.04735041, "balance_loss_mlp": 1.02015352, "epoch": 0.9857512174592677, "flos": 34130758060800.0, "grad_norm": 2.0942337449043946, "language_loss": 0.79973018, "learning_rate": 2.1118879785981815e-09, "loss": 0.82148159, "num_input_tokens_seen": 176953225, "step": 8198, "time_per_iteration": 2.608306646347046 }, { "auxiliary_loss_clip": 0.01132618, "auxiliary_loss_mlp": 0.01022905, "balance_loss_clip": 1.04415452, "balance_loss_mlp": 1.01602328, "epoch": 0.9858714603499068, "flos": 25994693266560.0, "grad_norm": 2.232802340793844, "language_loss": 0.79398656, "learning_rate": 2.0762511078862288e-09, "loss": 0.8155418, "num_input_tokens_seen": 176973570, "step": 8199, "time_per_iteration": 2.5608739852905273 }, { "auxiliary_loss_clip": 0.0113902, "auxiliary_loss_mlp": 0.01023708, "balance_loss_clip": 1.04515886, "balance_loss_mlp": 1.01715171, "epoch": 0.9859917032405459, "flos": 23696230907520.0, "grad_norm": 2.962773814585931, "language_loss": 0.64849603, "learning_rate": 2.0409173206186183e-09, "loss": 0.67012328, "num_input_tokens_seen": 176992810, "step": 8200, "time_per_iteration": 2.568518877029419 }, { "auxiliary_loss_clip": 0.01118744, "auxiliary_loss_mlp": 0.0102324, "balance_loss_clip": 1.04428244, "balance_loss_mlp": 1.01658225, "epoch": 0.986111946131185, "flos": 19938287134080.0, "grad_norm": 1.8272063480607426, "language_loss": 0.87122774, "learning_rate": 2.0058866221550617e-09, "loss": 0.89264762, "num_input_tokens_seen": 177011050, "step": 8201, "time_per_iteration": 2.5547945499420166 }, { "auxiliary_loss_clip": 0.01163604, "auxiliary_loss_mlp": 0.01022761, "balance_loss_clip": 1.04645944, "balance_loss_mlp": 1.01557899, "epoch": 0.9862321890218241, "flos": 19828831415040.0, "grad_norm": 2.193537908167759, "language_loss": 0.74987245, "learning_rate": 1.971159017809976e-09, "loss": 0.77173615, "num_input_tokens_seen": 177029340, "step": 8202, "time_per_iteration": 2.5150418281555176 }, { "auxiliary_loss_clip": 0.01150488, "auxiliary_loss_mlp": 0.01031273, "balance_loss_clip": 1.04754865, "balance_loss_mlp": 1.02417433, "epoch": 0.9863524319124631, "flos": 21652051904640.0, "grad_norm": 2.793705109920593, "language_loss": 0.77965641, "learning_rate": 1.93673451285159e-09, "loss": 0.80147398, "num_input_tokens_seen": 177048390, "step": 8203, "time_per_iteration": 2.4971063137054443 }, { "auxiliary_loss_clip": 0.01035985, "auxiliary_loss_mlp": 0.01003809, "balance_loss_clip": 1.00736725, "balance_loss_mlp": 1.00273645, "epoch": 0.9864726748031023, "flos": 52769977920000.0, "grad_norm": 0.7498271653482068, "language_loss": 0.56573653, "learning_rate": 1.9026131125019495e-09, "loss": 0.58613443, "num_input_tokens_seen": 177105760, "step": 8204, "time_per_iteration": 3.0454232692718506 }, { "auxiliary_loss_clip": 0.01142332, "auxiliary_loss_mlp": 0.01025106, "balance_loss_clip": 1.04477835, "balance_loss_mlp": 1.0185194, "epoch": 0.9865929176937414, "flos": 23364631526400.0, "grad_norm": 1.7962566798958002, "language_loss": 0.8679539, "learning_rate": 1.8687948219371363e-09, "loss": 0.88962829, "num_input_tokens_seen": 177124985, "step": 8205, "time_per_iteration": 2.4930319786071777 }, { "auxiliary_loss_clip": 0.01164306, "auxiliary_loss_mlp": 0.01024172, "balance_loss_clip": 1.04511404, "balance_loss_mlp": 1.01668513, "epoch": 0.9867131605843804, "flos": 21616679986560.0, "grad_norm": 1.996033335828532, "language_loss": 0.88128829, "learning_rate": 1.835279646287491e-09, "loss": 0.90317303, "num_input_tokens_seen": 177142995, "step": 8206, "time_per_iteration": 2.4537179470062256 }, { "auxiliary_loss_clip": 0.01158139, "auxiliary_loss_mlp": 0.01036924, "balance_loss_clip": 1.04872131, "balance_loss_mlp": 1.02980685, "epoch": 0.9868334034750196, "flos": 22271403139200.0, "grad_norm": 3.206933660395623, "language_loss": 0.76368988, "learning_rate": 1.8020675906371685e-09, "loss": 0.7856406, "num_input_tokens_seen": 177162390, "step": 8207, "time_per_iteration": 2.5134470462799072 }, { "auxiliary_loss_clip": 0.0110676, "auxiliary_loss_mlp": 0.01030019, "balance_loss_clip": 1.0414598, "balance_loss_mlp": 1.02328944, "epoch": 0.9869536463656586, "flos": 25809573548160.0, "grad_norm": 3.983203842884186, "language_loss": 0.75255853, "learning_rate": 1.7691586600243612e-09, "loss": 0.77392626, "num_input_tokens_seen": 177181290, "step": 8208, "time_per_iteration": 2.622799873352051 }, { "auxiliary_loss_clip": 0.01131381, "auxiliary_loss_mlp": 0.01026236, "balance_loss_clip": 1.04603016, "balance_loss_mlp": 1.01971531, "epoch": 0.9870738892562977, "flos": 16398500613120.0, "grad_norm": 2.2645542638476392, "language_loss": 0.86302316, "learning_rate": 1.7365528594415202e-09, "loss": 0.88459933, "num_input_tokens_seen": 177195360, "step": 8209, "time_per_iteration": 3.2535054683685303 }, { "auxiliary_loss_clip": 0.01153677, "auxiliary_loss_mlp": 0.00761043, "balance_loss_clip": 1.04635656, "balance_loss_mlp": 1.00031209, "epoch": 0.9871941321469369, "flos": 35481358373760.0, "grad_norm": 1.5976337467065016, "language_loss": 0.6728065, "learning_rate": 1.7042501938346888e-09, "loss": 0.69195366, "num_input_tokens_seen": 177218090, "step": 8210, "time_per_iteration": 2.6417734622955322 }, { "auxiliary_loss_clip": 0.01122655, "auxiliary_loss_mlp": 0.01024303, "balance_loss_clip": 1.03955686, "balance_loss_mlp": 1.01805091, "epoch": 0.9873143750375759, "flos": 21434217874560.0, "grad_norm": 1.8517183646738022, "language_loss": 0.76141292, "learning_rate": 1.6722506681043913e-09, "loss": 0.78288245, "num_input_tokens_seen": 177237050, "step": 8211, "time_per_iteration": 2.521176338195801 }, { "auxiliary_loss_clip": 0.01138876, "auxiliary_loss_mlp": 0.01031927, "balance_loss_clip": 1.04464686, "balance_loss_mlp": 1.02525151, "epoch": 0.987434617928215, "flos": 16326499800960.0, "grad_norm": 2.1497479606657097, "language_loss": 0.69620323, "learning_rate": 1.640554287104745e-09, "loss": 0.7179113, "num_input_tokens_seen": 177255325, "step": 8212, "time_per_iteration": 2.4906036853790283 }, { "auxiliary_loss_clip": 0.01122875, "auxiliary_loss_mlp": 0.01022217, "balance_loss_clip": 1.03986454, "balance_loss_mlp": 1.01468611, "epoch": 0.9875548608188541, "flos": 17851984456320.0, "grad_norm": 2.013528553741755, "language_loss": 0.79918098, "learning_rate": 1.609161055644348e-09, "loss": 0.82063198, "num_input_tokens_seen": 177271250, "step": 8213, "time_per_iteration": 2.536616086959839 }, { "auxiliary_loss_clip": 0.01154903, "auxiliary_loss_mlp": 0.01025499, "balance_loss_clip": 1.04550719, "balance_loss_mlp": 1.01810479, "epoch": 0.9876751037094932, "flos": 26132876887680.0, "grad_norm": 3.450638883567828, "language_loss": 0.68048888, "learning_rate": 1.5780709784849467e-09, "loss": 0.70229292, "num_input_tokens_seen": 177288270, "step": 8214, "time_per_iteration": 2.5331637859344482 }, { "auxiliary_loss_clip": 0.01098144, "auxiliary_loss_mlp": 0.01023337, "balance_loss_clip": 1.042045, "balance_loss_mlp": 1.01654243, "epoch": 0.9877953466001322, "flos": 15991344973440.0, "grad_norm": 1.8151629525076283, "language_loss": 0.82468772, "learning_rate": 1.5472840603436565e-09, "loss": 0.8459025, "num_input_tokens_seen": 177305500, "step": 8215, "time_per_iteration": 2.5798938274383545 }, { "auxiliary_loss_clip": 0.01135109, "auxiliary_loss_mlp": 0.01021263, "balance_loss_clip": 1.04520011, "balance_loss_mlp": 1.0145483, "epoch": 0.9879155894907714, "flos": 18806777827200.0, "grad_norm": 2.0134818481240244, "language_loss": 0.78353298, "learning_rate": 1.5168003058900757e-09, "loss": 0.80509675, "num_input_tokens_seen": 177323500, "step": 8216, "time_per_iteration": 2.519059896469116 }, { "auxiliary_loss_clip": 0.01120079, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 1.04205334, "balance_loss_mlp": 1.01897287, "epoch": 0.9880358323814105, "flos": 22382044007040.0, "grad_norm": 1.9036590323907412, "language_loss": 0.92143029, "learning_rate": 1.4866197197491715e-09, "loss": 0.94288325, "num_input_tokens_seen": 177342860, "step": 8217, "time_per_iteration": 2.5625405311584473 }, { "auxiliary_loss_clip": 0.01151728, "auxiliary_loss_mlp": 0.00761152, "balance_loss_clip": 1.04529512, "balance_loss_mlp": 1.00034046, "epoch": 0.9881560752720495, "flos": 15668831733120.0, "grad_norm": 3.1567063740449544, "language_loss": 0.7880404, "learning_rate": 1.4567423064988371e-09, "loss": 0.8071692, "num_input_tokens_seen": 177360210, "step": 8218, "time_per_iteration": 2.5072479248046875 }, { "auxiliary_loss_clip": 0.01164529, "auxiliary_loss_mlp": 0.01024509, "balance_loss_clip": 1.04771662, "balance_loss_mlp": 1.01802969, "epoch": 0.9882763181626887, "flos": 21500113374720.0, "grad_norm": 1.975171414687983, "language_loss": 0.78038013, "learning_rate": 1.4271680706718913e-09, "loss": 0.80227053, "num_input_tokens_seen": 177377885, "step": 8219, "time_per_iteration": 2.4703071117401123 }, { "auxiliary_loss_clip": 0.01150343, "auxiliary_loss_mlp": 0.0103172, "balance_loss_clip": 1.04665148, "balance_loss_mlp": 1.02440667, "epoch": 0.9883965610533277, "flos": 28034598551040.0, "grad_norm": 1.5987444560432211, "language_loss": 0.8229087, "learning_rate": 1.3978970167543013e-09, "loss": 0.8447293, "num_input_tokens_seen": 177398065, "step": 8220, "time_per_iteration": 2.561683177947998 }, { "auxiliary_loss_clip": 0.01127812, "auxiliary_loss_mlp": 0.01026548, "balance_loss_clip": 1.04373085, "balance_loss_mlp": 1.01929426, "epoch": 0.9885168039439668, "flos": 14098601710080.0, "grad_norm": 2.364618533429402, "language_loss": 0.77380878, "learning_rate": 1.3689291491867372e-09, "loss": 0.7953524, "num_input_tokens_seen": 177416380, "step": 8221, "time_per_iteration": 3.2372937202453613 }, { "auxiliary_loss_clip": 0.01164601, "auxiliary_loss_mlp": 0.01027007, "balance_loss_clip": 1.04731178, "balance_loss_mlp": 1.01975906, "epoch": 0.988637046834606, "flos": 26432013352320.0, "grad_norm": 1.921853861705802, "language_loss": 0.73580325, "learning_rate": 1.3402644723636836e-09, "loss": 0.75771928, "num_input_tokens_seen": 177438410, "step": 8222, "time_per_iteration": 3.2701165676116943 }, { "auxiliary_loss_clip": 0.01131952, "auxiliary_loss_mlp": 0.01026661, "balance_loss_clip": 1.04692733, "balance_loss_mlp": 1.01980627, "epoch": 0.988757289725245, "flos": 25229113764480.0, "grad_norm": 2.605076301146215, "language_loss": 0.83521318, "learning_rate": 1.311902990633218e-09, "loss": 0.85679924, "num_input_tokens_seen": 177457375, "step": 8223, "time_per_iteration": 2.559272050857544 }, { "auxiliary_loss_clip": 0.01123821, "auxiliary_loss_mlp": 0.0102579, "balance_loss_clip": 1.03776658, "balance_loss_mlp": 1.01902509, "epoch": 0.9888775326158841, "flos": 26359042872960.0, "grad_norm": 1.626584511073724, "language_loss": 0.71252638, "learning_rate": 1.2838447082978987e-09, "loss": 0.7340225, "num_input_tokens_seen": 177478530, "step": 8224, "time_per_iteration": 2.5671772956848145 }, { "auxiliary_loss_clip": 0.01145468, "auxiliary_loss_mlp": 0.01027006, "balance_loss_clip": 1.0429107, "balance_loss_mlp": 1.01980293, "epoch": 0.9889977755065231, "flos": 24316120846080.0, "grad_norm": 2.554812434572028, "language_loss": 0.83076161, "learning_rate": 1.2560896296143208e-09, "loss": 0.85248631, "num_input_tokens_seen": 177496995, "step": 8225, "time_per_iteration": 2.4873874187469482 }, { "auxiliary_loss_clip": 0.01161744, "auxiliary_loss_mlp": 0.01031345, "balance_loss_clip": 1.04676151, "balance_loss_mlp": 1.02448404, "epoch": 0.9891180183971623, "flos": 18951066760320.0, "grad_norm": 2.3495612003462685, "language_loss": 0.82457685, "learning_rate": 1.2286377587926722e-09, "loss": 0.84650779, "num_input_tokens_seen": 177513785, "step": 8226, "time_per_iteration": 2.45072078704834 }, { "auxiliary_loss_clip": 0.01161806, "auxiliary_loss_mlp": 0.01027999, "balance_loss_clip": 1.045506, "balance_loss_mlp": 1.02109683, "epoch": 0.9892382612878013, "flos": 26176580760960.0, "grad_norm": 1.9790991968782994, "language_loss": 0.74568784, "learning_rate": 1.2014890999973992e-09, "loss": 0.76758587, "num_input_tokens_seen": 177530705, "step": 8227, "time_per_iteration": 2.470072031021118 }, { "auxiliary_loss_clip": 0.011592, "auxiliary_loss_mlp": 0.01022905, "balance_loss_clip": 1.04446363, "balance_loss_mlp": 1.01611948, "epoch": 0.9893585041784404, "flos": 25449605400960.0, "grad_norm": 1.4224390814015884, "language_loss": 0.78233516, "learning_rate": 1.1746436573472073e-09, "loss": 0.80415618, "num_input_tokens_seen": 177552440, "step": 8228, "time_per_iteration": 2.5024070739746094 }, { "auxiliary_loss_clip": 0.01144111, "auxiliary_loss_mlp": 0.01028103, "balance_loss_clip": 1.04526353, "balance_loss_mlp": 1.02089095, "epoch": 0.9894787470690796, "flos": 20189302352640.0, "grad_norm": 1.9558620046210509, "language_loss": 0.69219267, "learning_rate": 1.1481014349141726e-09, "loss": 0.71391475, "num_input_tokens_seen": 177569660, "step": 8229, "time_per_iteration": 2.483761787414551 }, { "auxiliary_loss_clip": 0.01136575, "auxiliary_loss_mlp": 0.01030235, "balance_loss_clip": 1.04510188, "balance_loss_mlp": 1.02309096, "epoch": 0.9895989899597186, "flos": 24644308435200.0, "grad_norm": 1.9585461652833278, "language_loss": 0.84365648, "learning_rate": 1.121862436724852e-09, "loss": 0.86532462, "num_input_tokens_seen": 177588500, "step": 8230, "time_per_iteration": 2.557873487472534 }, { "auxiliary_loss_clip": 0.0115073, "auxiliary_loss_mlp": 0.01026706, "balance_loss_clip": 1.04829359, "balance_loss_mlp": 1.01961899, "epoch": 0.9897192328503577, "flos": 21799034357760.0, "grad_norm": 1.7210756587880818, "language_loss": 0.70398289, "learning_rate": 1.0959266667598388e-09, "loss": 0.7257573, "num_input_tokens_seen": 177607315, "step": 8231, "time_per_iteration": 2.4850831031799316 }, { "auxiliary_loss_clip": 0.01124728, "auxiliary_loss_mlp": 0.01027234, "balance_loss_clip": 1.04511976, "balance_loss_mlp": 1.01941407, "epoch": 0.9898394757409968, "flos": 21325229032320.0, "grad_norm": 1.8996835612537466, "language_loss": 0.74465835, "learning_rate": 1.0702941289533196e-09, "loss": 0.76617795, "num_input_tokens_seen": 177625990, "step": 8232, "time_per_iteration": 2.561312675476074 }, { "auxiliary_loss_clip": 0.01120903, "auxiliary_loss_mlp": 0.01021785, "balance_loss_clip": 1.04407215, "balance_loss_mlp": 1.01529741, "epoch": 0.9899597186316359, "flos": 18545024442240.0, "grad_norm": 2.013369576142551, "language_loss": 0.88509655, "learning_rate": 1.0449648271939615e-09, "loss": 0.90652347, "num_input_tokens_seen": 177642335, "step": 8233, "time_per_iteration": 2.519787311553955 }, { "auxiliary_loss_clip": 0.01110563, "auxiliary_loss_mlp": 0.00760718, "balance_loss_clip": 1.04354286, "balance_loss_mlp": 1.00030005, "epoch": 0.990079961522275, "flos": 23766723348480.0, "grad_norm": 1.5738199474029242, "language_loss": 0.72332215, "learning_rate": 1.0199387653240243e-09, "loss": 0.74203503, "num_input_tokens_seen": 177662025, "step": 8234, "time_per_iteration": 2.620490789413452 }, { "auxiliary_loss_clip": 0.01129792, "auxiliary_loss_mlp": 0.01028316, "balance_loss_clip": 1.04358482, "balance_loss_mlp": 1.02179849, "epoch": 0.9902002044129141, "flos": 16399182971520.0, "grad_norm": 1.571913397748716, "language_loss": 0.70571208, "learning_rate": 9.952159471400267e-10, "loss": 0.72729319, "num_input_tokens_seen": 177679065, "step": 8235, "time_per_iteration": 3.262352466583252 }, { "auxiliary_loss_clip": 0.01146125, "auxiliary_loss_mlp": 0.00760843, "balance_loss_clip": 1.04368079, "balance_loss_mlp": 1.00033343, "epoch": 0.9903204473035532, "flos": 22559657783040.0, "grad_norm": 1.8115604247725943, "language_loss": 0.84358883, "learning_rate": 9.707963763923022e-10, "loss": 0.86265856, "num_input_tokens_seen": 177698115, "step": 8236, "time_per_iteration": 2.5048279762268066 }, { "auxiliary_loss_clip": 0.01132087, "auxiliary_loss_mlp": 0.01029499, "balance_loss_clip": 1.04136646, "balance_loss_mlp": 1.02265596, "epoch": 0.9904406901941922, "flos": 16144001775360.0, "grad_norm": 1.712476354820829, "language_loss": 0.79204631, "learning_rate": 9.466800567854427e-10, "loss": 0.81366211, "num_input_tokens_seen": 177716715, "step": 8237, "time_per_iteration": 2.512277603149414 }, { "auxiliary_loss_clip": 0.01116969, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.03966057, "balance_loss_mlp": 1.02526832, "epoch": 0.9905609330848314, "flos": 26651499408000.0, "grad_norm": 1.9871302730263303, "language_loss": 0.68110609, "learning_rate": 9.228669919778553e-10, "loss": 0.70260483, "num_input_tokens_seen": 177735640, "step": 8238, "time_per_iteration": 2.598832607269287 }, { "auxiliary_loss_clip": 0.01126816, "auxiliary_loss_mlp": 0.01029805, "balance_loss_clip": 1.04293346, "balance_loss_mlp": 1.02220201, "epoch": 0.9906811759754705, "flos": 23111820627840.0, "grad_norm": 2.5266020650754917, "language_loss": 0.79402167, "learning_rate": 8.993571855817617e-10, "loss": 0.81558788, "num_input_tokens_seen": 177754470, "step": 8239, "time_per_iteration": 2.5466606616973877 }, { "auxiliary_loss_clip": 0.01146196, "auxiliary_loss_mlp": 0.01029063, "balance_loss_clip": 1.0442344, "balance_loss_mlp": 1.02205384, "epoch": 0.9908014188661095, "flos": 22090593052800.0, "grad_norm": 1.726437701185197, "language_loss": 0.74972498, "learning_rate": 8.761506411638642e-10, "loss": 0.77147758, "num_input_tokens_seen": 177773935, "step": 8240, "time_per_iteration": 2.504885196685791 }, { "auxiliary_loss_clip": 0.01132727, "auxiliary_loss_mlp": 0.0102744, "balance_loss_clip": 1.04496789, "balance_loss_mlp": 1.02053499, "epoch": 0.9909216617567487, "flos": 19242948677760.0, "grad_norm": 1.6442008899685288, "language_loss": 0.73800987, "learning_rate": 8.53247362244236e-10, "loss": 0.75961155, "num_input_tokens_seen": 177792745, "step": 8241, "time_per_iteration": 2.620187997817993 }, { "auxiliary_loss_clip": 0.01135702, "auxiliary_loss_mlp": 0.01025289, "balance_loss_clip": 1.04525709, "balance_loss_mlp": 1.01841688, "epoch": 0.9910419046473877, "flos": 23621213352960.0, "grad_norm": 1.624548295150006, "language_loss": 0.68210828, "learning_rate": 8.306473522976532e-10, "loss": 0.70371819, "num_input_tokens_seen": 177812150, "step": 8242, "time_per_iteration": 2.6017086505889893 }, { "auxiliary_loss_clip": 0.01163104, "auxiliary_loss_mlp": 0.01023171, "balance_loss_clip": 1.04809892, "balance_loss_mlp": 1.01636446, "epoch": 0.9911621475380268, "flos": 22711380831360.0, "grad_norm": 1.841881164736898, "language_loss": 0.71777678, "learning_rate": 8.083506147522623e-10, "loss": 0.73963952, "num_input_tokens_seen": 177831545, "step": 8243, "time_per_iteration": 2.4943394660949707 }, { "auxiliary_loss_clip": 0.01144494, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.04450583, "balance_loss_mlp": 1.02518296, "epoch": 0.991282390428666, "flos": 13516956777600.0, "grad_norm": 2.0285459582719585, "language_loss": 0.85283607, "learning_rate": 7.863571529906909e-10, "loss": 0.87460172, "num_input_tokens_seen": 177847130, "step": 8244, "time_per_iteration": 2.461913585662842 }, { "auxiliary_loss_clip": 0.01045539, "auxiliary_loss_mlp": 0.01002473, "balance_loss_clip": 1.00771713, "balance_loss_mlp": 1.00138807, "epoch": 0.991402633319305, "flos": 61830492071040.0, "grad_norm": 0.723432033885823, "language_loss": 0.5969578, "learning_rate": 7.646669703489372e-10, "loss": 0.61743796, "num_input_tokens_seen": 177911440, "step": 8245, "time_per_iteration": 3.191058874130249 }, { "auxiliary_loss_clip": 0.0105614, "auxiliary_loss_mlp": 0.01025123, "balance_loss_clip": 1.03320813, "balance_loss_mlp": 1.01825953, "epoch": 0.9915228762099441, "flos": 18770148933120.0, "grad_norm": 2.000644186691286, "language_loss": 0.57299036, "learning_rate": 7.432800701177023e-10, "loss": 0.59380305, "num_input_tokens_seen": 177929440, "step": 8246, "time_per_iteration": 2.9199795722961426 }, { "auxiliary_loss_clip": 0.01038188, "auxiliary_loss_mlp": 0.01003859, "balance_loss_clip": 1.01003313, "balance_loss_mlp": 1.00272608, "epoch": 0.9916431191005832, "flos": 65936660244480.0, "grad_norm": 0.7970068630094421, "language_loss": 0.57820463, "learning_rate": 7.221964555415017e-10, "loss": 0.59862506, "num_input_tokens_seen": 177989100, "step": 8247, "time_per_iteration": 3.3220455646514893 }, { "auxiliary_loss_clip": 0.01131756, "auxiliary_loss_mlp": 0.01025492, "balance_loss_clip": 1.04293537, "balance_loss_mlp": 1.01891792, "epoch": 0.9917633619912223, "flos": 16581573256320.0, "grad_norm": 1.7686047441929207, "language_loss": 0.74613559, "learning_rate": 7.01416129818222e-10, "loss": 0.76770806, "num_input_tokens_seen": 178006720, "step": 8248, "time_per_iteration": 5.224648475646973 }, { "auxiliary_loss_clip": 0.01129829, "auxiliary_loss_mlp": 0.01033557, "balance_loss_clip": 1.0453217, "balance_loss_mlp": 1.02676797, "epoch": 0.9918836048818613, "flos": 25411108999680.0, "grad_norm": 1.86511210999876, "language_loss": 0.58109784, "learning_rate": 6.809390961006745e-10, "loss": 0.60273176, "num_input_tokens_seen": 178026850, "step": 8249, "time_per_iteration": 2.628622531890869 }, { "auxiliary_loss_clip": 0.01133489, "auxiliary_loss_mlp": 0.01027643, "balance_loss_clip": 1.04473758, "balance_loss_mlp": 1.0209372, "epoch": 0.9920038477725005, "flos": 25046867134080.0, "grad_norm": 1.7874624000010249, "language_loss": 0.6868434, "learning_rate": 6.607653574948191e-10, "loss": 0.70845461, "num_input_tokens_seen": 178047630, "step": 8250, "time_per_iteration": 2.5689046382904053 }, { "auxiliary_loss_clip": 0.01140737, "auxiliary_loss_mlp": 0.0102462, "balance_loss_clip": 1.0421226, "balance_loss_mlp": 1.0180968, "epoch": 0.9921240906631396, "flos": 21829773421440.0, "grad_norm": 1.9909335729117181, "language_loss": 0.81322026, "learning_rate": 6.408949170613187e-10, "loss": 0.83487386, "num_input_tokens_seen": 178066895, "step": 8251, "time_per_iteration": 2.5335917472839355 }, { "auxiliary_loss_clip": 0.01133822, "auxiliary_loss_mlp": 0.01025561, "balance_loss_clip": 1.04313517, "balance_loss_mlp": 1.0179975, "epoch": 0.9922443335537786, "flos": 24864225454080.0, "grad_norm": 1.5339671870432552, "language_loss": 0.81686127, "learning_rate": 6.213277778144288e-10, "loss": 0.83845508, "num_input_tokens_seen": 178088540, "step": 8252, "time_per_iteration": 2.5822854042053223 }, { "auxiliary_loss_clip": 0.01093083, "auxiliary_loss_mlp": 0.01026512, "balance_loss_clip": 1.03903866, "balance_loss_mlp": 1.01973176, "epoch": 0.9923645764444178, "flos": 21613088626560.0, "grad_norm": 2.012503473027396, "language_loss": 0.66732574, "learning_rate": 6.020639427224416e-10, "loss": 0.68852174, "num_input_tokens_seen": 178106185, "step": 8253, "time_per_iteration": 2.647977113723755 }, { "auxiliary_loss_clip": 0.01138293, "auxiliary_loss_mlp": 0.01033463, "balance_loss_clip": 1.0463984, "balance_loss_mlp": 1.026281, "epoch": 0.9924848193350568, "flos": 25001798544000.0, "grad_norm": 1.9179504151595543, "language_loss": 0.72529209, "learning_rate": 5.831034147076864e-10, "loss": 0.74700975, "num_input_tokens_seen": 178123435, "step": 8254, "time_per_iteration": 2.56266188621521 }, { "auxiliary_loss_clip": 0.01041809, "auxiliary_loss_mlp": 0.01001653, "balance_loss_clip": 1.00751042, "balance_loss_mlp": 1.00063956, "epoch": 0.9926050622256959, "flos": 68912543151360.0, "grad_norm": 0.6861660800769509, "language_loss": 0.5571776, "learning_rate": 5.644461966463065e-10, "loss": 0.57761222, "num_input_tokens_seen": 178191045, "step": 8255, "time_per_iteration": 3.1965014934539795 }, { "auxiliary_loss_clip": 0.01134776, "auxiliary_loss_mlp": 0.01023885, "balance_loss_clip": 1.04640961, "balance_loss_mlp": 1.0176084, "epoch": 0.9927253051163349, "flos": 20923675914240.0, "grad_norm": 1.74973038744095, "language_loss": 0.75481474, "learning_rate": 5.460922913687049e-10, "loss": 0.77640134, "num_input_tokens_seen": 178210135, "step": 8256, "time_per_iteration": 2.561108112335205 }, { "auxiliary_loss_clip": 0.01104357, "auxiliary_loss_mlp": 0.00761282, "balance_loss_clip": 1.03893089, "balance_loss_mlp": 1.00031126, "epoch": 0.9928455480069741, "flos": 22308211601280.0, "grad_norm": 1.9479586282224022, "language_loss": 0.75148511, "learning_rate": 5.280417016593208e-10, "loss": 0.77014148, "num_input_tokens_seen": 178229925, "step": 8257, "time_per_iteration": 2.6225130558013916 }, { "auxiliary_loss_clip": 0.01148667, "auxiliary_loss_mlp": 0.00760374, "balance_loss_clip": 1.04801011, "balance_loss_mlp": 1.00031579, "epoch": 0.9929657908976132, "flos": 17383889393280.0, "grad_norm": 1.6202572454842186, "language_loss": 0.74849093, "learning_rate": 5.102944302559642e-10, "loss": 0.76758134, "num_input_tokens_seen": 178247420, "step": 8258, "time_per_iteration": 2.4791719913482666 }, { "auxiliary_loss_clip": 0.01098333, "auxiliary_loss_mlp": 0.01027366, "balance_loss_clip": 1.03991568, "balance_loss_mlp": 1.02008581, "epoch": 0.9930860337882522, "flos": 22674680110080.0, "grad_norm": 2.002865524421131, "language_loss": 0.79429346, "learning_rate": 4.9285047985137e-10, "loss": 0.81555045, "num_input_tokens_seen": 178266840, "step": 8259, "time_per_iteration": 2.6615512371063232 }, { "auxiliary_loss_clip": 0.0115328, "auxiliary_loss_mlp": 0.01025202, "balance_loss_clip": 1.04713154, "balance_loss_mlp": 1.01811182, "epoch": 0.9932062766788914, "flos": 28147789284480.0, "grad_norm": 1.7540982261873745, "language_loss": 0.74542493, "learning_rate": 4.757098530916436e-10, "loss": 0.76720977, "num_input_tokens_seen": 178287285, "step": 8260, "time_per_iteration": 2.5757315158843994 }, { "auxiliary_loss_clip": 0.01151571, "auxiliary_loss_mlp": 0.01034117, "balance_loss_clip": 1.04770994, "balance_loss_mlp": 1.02638054, "epoch": 0.9933265195695304, "flos": 20156659868160.0, "grad_norm": 3.153098093849159, "language_loss": 0.76993167, "learning_rate": 4.5887255257670563e-10, "loss": 0.79178852, "num_input_tokens_seen": 178304325, "step": 8261, "time_per_iteration": 3.2127346992492676 }, { "auxiliary_loss_clip": 0.01161722, "auxiliary_loss_mlp": 0.01027057, "balance_loss_clip": 1.04608214, "balance_loss_mlp": 1.0199374, "epoch": 0.9934467624601695, "flos": 21362037494400.0, "grad_norm": 2.02111997817168, "language_loss": 0.76946223, "learning_rate": 4.4233858086117906e-10, "loss": 0.79135007, "num_input_tokens_seen": 178322850, "step": 8262, "time_per_iteration": 2.4835519790649414 }, { "auxiliary_loss_clip": 0.01106765, "auxiliary_loss_mlp": 0.01024837, "balance_loss_clip": 1.04657459, "balance_loss_mlp": 1.01760149, "epoch": 0.9935670053508087, "flos": 19756040503680.0, "grad_norm": 9.714070513744952, "language_loss": 0.67707002, "learning_rate": 4.261079404528356e-10, "loss": 0.69838601, "num_input_tokens_seen": 178342330, "step": 8263, "time_per_iteration": 2.58988356590271 }, { "auxiliary_loss_clip": 0.0114498, "auxiliary_loss_mlp": 0.01028123, "balance_loss_clip": 1.04365969, "balance_loss_mlp": 1.0211997, "epoch": 0.9936872482414477, "flos": 21978838863360.0, "grad_norm": 1.7509387414411932, "language_loss": 0.69003582, "learning_rate": 4.1018063381437205e-10, "loss": 0.7117669, "num_input_tokens_seen": 178362715, "step": 8264, "time_per_iteration": 2.521165132522583 }, { "auxiliary_loss_clip": 0.01038792, "auxiliary_loss_mlp": 0.01001495, "balance_loss_clip": 1.0084796, "balance_loss_mlp": 1.00050592, "epoch": 0.9938074911320868, "flos": 69810667839360.0, "grad_norm": 0.8672093628831413, "language_loss": 0.6121372, "learning_rate": 3.9455666336141167e-10, "loss": 0.63254005, "num_input_tokens_seen": 178426495, "step": 8265, "time_per_iteration": 3.1320581436157227 }, { "auxiliary_loss_clip": 0.01163977, "auxiliary_loss_mlp": 0.01024371, "balance_loss_clip": 1.04938245, "balance_loss_mlp": 1.0171324, "epoch": 0.9939277340227259, "flos": 15084170058240.0, "grad_norm": 4.226307397096864, "language_loss": 0.82864684, "learning_rate": 3.7923603146450267e-10, "loss": 0.85053033, "num_input_tokens_seen": 178442555, "step": 8266, "time_per_iteration": 2.437268018722534 }, { "auxiliary_loss_clip": 0.01120861, "auxiliary_loss_mlp": 0.01027898, "balance_loss_clip": 1.04052806, "balance_loss_mlp": 1.02114749, "epoch": 0.994047976913365, "flos": 17712364291200.0, "grad_norm": 2.0514783637448093, "language_loss": 0.80019104, "learning_rate": 3.642187404473418e-10, "loss": 0.82167864, "num_input_tokens_seen": 178460715, "step": 8267, "time_per_iteration": 2.5241494178771973 }, { "auxiliary_loss_clip": 0.01148497, "auxiliary_loss_mlp": 0.01022546, "balance_loss_clip": 1.044119, "balance_loss_mlp": 1.01616549, "epoch": 0.994168219804004, "flos": 19171558396800.0, "grad_norm": 2.0150538113329417, "language_loss": 0.858181, "learning_rate": 3.495047925885508e-10, "loss": 0.8798914, "num_input_tokens_seen": 178479050, "step": 8268, "time_per_iteration": 2.484640598297119 }, { "auxiliary_loss_clip": 0.011323, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 1.04295444, "balance_loss_mlp": 1.02204657, "epoch": 0.9942884626946432, "flos": 17851589406720.0, "grad_norm": 2.024992408144145, "language_loss": 0.82485622, "learning_rate": 3.350941901199e-10, "loss": 0.84647274, "num_input_tokens_seen": 178495970, "step": 8269, "time_per_iteration": 2.505986452102661 }, { "auxiliary_loss_clip": 0.01137386, "auxiliary_loss_mlp": 0.01028062, "balance_loss_clip": 1.04319561, "balance_loss_mlp": 1.02091837, "epoch": 0.9944087055852823, "flos": 18796578364800.0, "grad_norm": 2.370854847107553, "language_loss": 0.83335185, "learning_rate": 3.2098693522764066e-10, "loss": 0.85500634, "num_input_tokens_seen": 178509170, "step": 8270, "time_per_iteration": 2.5096073150634766 }, { "auxiliary_loss_clip": 0.01140958, "auxiliary_loss_mlp": 0.00760637, "balance_loss_clip": 1.04443812, "balance_loss_mlp": 1.00028133, "epoch": 0.9945289484759213, "flos": 20996969616000.0, "grad_norm": 1.9514665098082589, "language_loss": 0.8108741, "learning_rate": 3.071830300516165e-10, "loss": 0.82989013, "num_input_tokens_seen": 178527000, "step": 8271, "time_per_iteration": 2.5300827026367188 }, { "auxiliary_loss_clip": 0.01155083, "auxiliary_loss_mlp": 0.01032204, "balance_loss_clip": 1.04580247, "balance_loss_mlp": 1.02440202, "epoch": 0.9946491913665605, "flos": 14756952136320.0, "grad_norm": 2.536991268510617, "language_loss": 0.70575607, "learning_rate": 2.9368247668615234e-10, "loss": 0.72762895, "num_input_tokens_seen": 178545590, "step": 8272, "time_per_iteration": 2.4701550006866455 }, { "auxiliary_loss_clip": 0.01169532, "auxiliary_loss_mlp": 0.01030814, "balance_loss_clip": 1.05077982, "balance_loss_mlp": 1.02278531, "epoch": 0.9947694342571995, "flos": 12669931186560.0, "grad_norm": 2.305085448935493, "language_loss": 0.61534834, "learning_rate": 2.804852771789434e-10, "loss": 0.63735169, "num_input_tokens_seen": 178558890, "step": 8273, "time_per_iteration": 3.163057327270508 }, { "auxiliary_loss_clip": 0.01160099, "auxiliary_loss_mlp": 0.0102705, "balance_loss_clip": 1.04595995, "balance_loss_mlp": 1.02031183, "epoch": 0.9948896771478386, "flos": 18843442634880.0, "grad_norm": 1.7121309036992203, "language_loss": 0.55949914, "learning_rate": 2.675914335321661e-10, "loss": 0.58137071, "num_input_tokens_seen": 178577645, "step": 8274, "time_per_iteration": 3.2814059257507324 }, { "auxiliary_loss_clip": 0.01154009, "auxiliary_loss_mlp": 0.01029494, "balance_loss_clip": 1.04646564, "balance_loss_mlp": 1.02187109, "epoch": 0.9950099200384778, "flos": 24900207903360.0, "grad_norm": 2.332244309402494, "language_loss": 0.78862119, "learning_rate": 2.550009477018111e-10, "loss": 0.81045628, "num_input_tokens_seen": 178596415, "step": 8275, "time_per_iteration": 2.530961513519287 }, { "auxiliary_loss_clip": 0.01135944, "auxiliary_loss_mlp": 0.00761084, "balance_loss_clip": 1.04614568, "balance_loss_mlp": 1.00030708, "epoch": 0.9951301629291168, "flos": 23733613987200.0, "grad_norm": 1.8633928903625439, "language_loss": 0.63044238, "learning_rate": 2.4271382159790634e-10, "loss": 0.64941263, "num_input_tokens_seen": 178613845, "step": 8276, "time_per_iteration": 2.556291341781616 }, { "auxiliary_loss_clip": 0.0110119, "auxiliary_loss_mlp": 0.01030644, "balance_loss_clip": 1.0423528, "balance_loss_mlp": 1.02355993, "epoch": 0.9952504058197559, "flos": 22236893147520.0, "grad_norm": 3.100525542981254, "language_loss": 0.85798132, "learning_rate": 2.3073005708429406e-10, "loss": 0.87929964, "num_input_tokens_seen": 178633490, "step": 8277, "time_per_iteration": 2.711155891418457 }, { "auxiliary_loss_clip": 0.0111937, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.04615808, "balance_loss_mlp": 1.02336514, "epoch": 0.995370648710395, "flos": 21211032718080.0, "grad_norm": 1.757792554636068, "language_loss": 0.72062683, "learning_rate": 2.190496559788535e-10, "loss": 0.74212229, "num_input_tokens_seen": 178651775, "step": 8278, "time_per_iteration": 2.758758306503296 }, { "auxiliary_loss_clip": 0.01132923, "auxiliary_loss_mlp": 0.01028474, "balance_loss_clip": 1.04462183, "balance_loss_mlp": 1.0214076, "epoch": 0.9954908916010341, "flos": 14866731077760.0, "grad_norm": 2.175126843485569, "language_loss": 0.76892829, "learning_rate": 2.0767262005372265e-10, "loss": 0.79054219, "num_input_tokens_seen": 178669290, "step": 8279, "time_per_iteration": 2.534245014190674 }, { "auxiliary_loss_clip": 0.01123476, "auxiliary_loss_mlp": 0.01025329, "balance_loss_clip": 1.04259968, "balance_loss_mlp": 1.01863277, "epoch": 0.9956111344916732, "flos": 19208259118080.0, "grad_norm": 2.7156187235500395, "language_loss": 0.75649679, "learning_rate": 1.965989510346322e-10, "loss": 0.77798486, "num_input_tokens_seen": 178688410, "step": 8280, "time_per_iteration": 2.5697991847991943 }, { "auxiliary_loss_clip": 0.01102474, "auxiliary_loss_mlp": 0.01024854, "balance_loss_clip": 1.04103434, "balance_loss_mlp": 1.01701009, "epoch": 0.9957313773823123, "flos": 20047060494720.0, "grad_norm": 2.1810898914497607, "language_loss": 0.71021485, "learning_rate": 1.8582865060134955e-10, "loss": 0.73148811, "num_input_tokens_seen": 178706600, "step": 8281, "time_per_iteration": 2.594902753829956 }, { "auxiliary_loss_clip": 0.01054195, "auxiliary_loss_mlp": 0.01002704, "balance_loss_clip": 1.00752318, "balance_loss_mlp": 1.00170314, "epoch": 0.9958516202729514, "flos": 57483253768320.0, "grad_norm": 0.7765989122426452, "language_loss": 0.55758941, "learning_rate": 1.7536172038790098e-10, "loss": 0.57815844, "num_input_tokens_seen": 178766910, "step": 8282, "time_per_iteration": 3.1289587020874023 }, { "auxiliary_loss_clip": 0.0113599, "auxiliary_loss_mlp": 0.01029686, "balance_loss_clip": 1.04614258, "balance_loss_mlp": 1.02268541, "epoch": 0.9959718631635904, "flos": 27782900974080.0, "grad_norm": 2.0919306032052023, "language_loss": 0.69284832, "learning_rate": 1.651981619819054e-10, "loss": 0.71450502, "num_input_tokens_seen": 178784060, "step": 8283, "time_per_iteration": 2.5752103328704834 }, { "auxiliary_loss_clip": 0.01109819, "auxiliary_loss_mlp": 0.01025017, "balance_loss_clip": 1.04335594, "balance_loss_mlp": 1.01850259, "epoch": 0.9960921060542296, "flos": 24024095274240.0, "grad_norm": 2.5778428664027273, "language_loss": 0.7088697, "learning_rate": 1.5533797692546257e-10, "loss": 0.73021805, "num_input_tokens_seen": 178802795, "step": 8284, "time_per_iteration": 2.6088147163391113 }, { "auxiliary_loss_clip": 0.01145449, "auxiliary_loss_mlp": 0.01024066, "balance_loss_clip": 1.04370773, "balance_loss_mlp": 1.01657939, "epoch": 0.9962123489448687, "flos": 18697393935360.0, "grad_norm": 2.075797015525156, "language_loss": 0.84501386, "learning_rate": 1.4578116671404296e-10, "loss": 0.86670899, "num_input_tokens_seen": 178821075, "step": 8285, "time_per_iteration": 2.48188853263855 }, { "auxiliary_loss_clip": 0.01144405, "auxiliary_loss_mlp": 0.01029843, "balance_loss_clip": 1.04705262, "balance_loss_mlp": 1.02253258, "epoch": 0.9963325918355077, "flos": 20010754823040.0, "grad_norm": 1.8877487381632618, "language_loss": 0.71374524, "learning_rate": 1.3652773279759777e-10, "loss": 0.73548776, "num_input_tokens_seen": 178837725, "step": 8286, "time_per_iteration": 2.488166332244873 }, { "auxiliary_loss_clip": 0.01149019, "auxiliary_loss_mlp": 0.01028515, "balance_loss_clip": 1.04534185, "balance_loss_mlp": 1.02081084, "epoch": 0.9964528347261468, "flos": 33108488991360.0, "grad_norm": 1.7315404951726763, "language_loss": 0.6282227, "learning_rate": 1.2757767657989305e-10, "loss": 0.64999801, "num_input_tokens_seen": 178861515, "step": 8287, "time_per_iteration": 3.3093338012695312 }, { "auxiliary_loss_clip": 0.0114846, "auxiliary_loss_mlp": 0.01021393, "balance_loss_clip": 1.04671752, "balance_loss_mlp": 1.01463962, "epoch": 0.9965730776167859, "flos": 23109342589440.0, "grad_norm": 2.1406674432266914, "language_loss": 0.87206125, "learning_rate": 1.1893099941850948e-10, "loss": 0.89375979, "num_input_tokens_seen": 178880410, "step": 8288, "time_per_iteration": 2.5149528980255127 }, { "auxiliary_loss_clip": 0.01136835, "auxiliary_loss_mlp": 0.01026158, "balance_loss_clip": 1.04242706, "balance_loss_mlp": 1.01902056, "epoch": 0.996693320507425, "flos": 22965843755520.0, "grad_norm": 2.098934167889525, "language_loss": 0.77253771, "learning_rate": 1.105877026252866e-10, "loss": 0.79416764, "num_input_tokens_seen": 178898740, "step": 8289, "time_per_iteration": 2.54060697555542 }, { "auxiliary_loss_clip": 0.01162864, "auxiliary_loss_mlp": 0.01026906, "balance_loss_clip": 1.04515636, "balance_loss_mlp": 1.01981258, "epoch": 0.996813563398064, "flos": 13222740476160.0, "grad_norm": 1.867658553150117, "language_loss": 0.72035944, "learning_rate": 1.0254778746565663e-10, "loss": 0.74225712, "num_input_tokens_seen": 178914015, "step": 8290, "time_per_iteration": 2.4171926975250244 }, { "auxiliary_loss_clip": 0.01120771, "auxiliary_loss_mlp": 0.0102802, "balance_loss_clip": 1.04482841, "balance_loss_mlp": 1.02209842, "epoch": 0.9969338062887032, "flos": 14647855553280.0, "grad_norm": 1.9489265742607471, "language_loss": 0.73119605, "learning_rate": 9.481125515953259e-11, "loss": 0.75268394, "num_input_tokens_seen": 178932075, "step": 8291, "time_per_iteration": 2.538266897201538 }, { "auxiliary_loss_clip": 0.01107751, "auxiliary_loss_mlp": 0.0102401, "balance_loss_clip": 1.03910947, "balance_loss_mlp": 1.01651525, "epoch": 0.9970540491793423, "flos": 25735741142400.0, "grad_norm": 1.8025186974646996, "language_loss": 0.79609084, "learning_rate": 8.737810688064228e-11, "loss": 0.81740844, "num_input_tokens_seen": 178951910, "step": 8292, "time_per_iteration": 2.6331288814544678 }, { "auxiliary_loss_clip": 0.01113174, "auxiliary_loss_mlp": 0.01026012, "balance_loss_clip": 1.04163408, "balance_loss_mlp": 1.01823914, "epoch": 0.9971742920699813, "flos": 21470236237440.0, "grad_norm": 1.9234265745609482, "language_loss": 0.78945726, "learning_rate": 8.024834375608414e-11, "loss": 0.81084907, "num_input_tokens_seen": 178970500, "step": 8293, "time_per_iteration": 2.5736336708068848 }, { "auxiliary_loss_clip": 0.01054546, "auxiliary_loss_mlp": 0.01002371, "balance_loss_clip": 1.00784206, "balance_loss_mlp": 1.00128639, "epoch": 0.9972945349606205, "flos": 72211223629440.0, "grad_norm": 0.8171875339577178, "language_loss": 0.62877083, "learning_rate": 7.342196686788149e-11, "loss": 0.64934003, "num_input_tokens_seen": 179023665, "step": 8294, "time_per_iteration": 2.942809581756592 }, { "auxiliary_loss_clip": 0.01130575, "auxiliary_loss_mlp": 0.01027029, "balance_loss_clip": 1.04650044, "balance_loss_mlp": 1.02004004, "epoch": 0.9974147778512595, "flos": 19678293515520.0, "grad_norm": 2.07106461735359, "language_loss": 0.68687475, "learning_rate": 6.689897725142834e-11, "loss": 0.70845073, "num_input_tokens_seen": 179043140, "step": 8295, "time_per_iteration": 2.5541605949401855 }, { "auxiliary_loss_clip": 0.01136147, "auxiliary_loss_mlp": 0.01024277, "balance_loss_clip": 1.04398108, "balance_loss_mlp": 1.01738095, "epoch": 0.9975350207418986, "flos": 15960821391360.0, "grad_norm": 2.086324605457267, "language_loss": 0.88386971, "learning_rate": 6.067937589615545e-11, "loss": 0.90547401, "num_input_tokens_seen": 179061215, "step": 8296, "time_per_iteration": 2.5192599296569824 }, { "auxiliary_loss_clip": 0.01038721, "auxiliary_loss_mlp": 0.0100272, "balance_loss_clip": 1.00969887, "balance_loss_mlp": 1.00160491, "epoch": 0.9976552636325378, "flos": 59961879768960.0, "grad_norm": 0.7488544704656532, "language_loss": 0.57735109, "learning_rate": 5.476316374575241e-11, "loss": 0.59776545, "num_input_tokens_seen": 179124700, "step": 8297, "time_per_iteration": 3.1339709758758545 }, { "auxiliary_loss_clip": 0.0116467, "auxiliary_loss_mlp": 0.01035364, "balance_loss_clip": 1.04674149, "balance_loss_mlp": 1.02737725, "epoch": 0.9977755065231768, "flos": 22487872452480.0, "grad_norm": 1.853087203551493, "language_loss": 0.7226609, "learning_rate": 4.9150341697723476e-11, "loss": 0.74466121, "num_input_tokens_seen": 179144590, "step": 8298, "time_per_iteration": 2.4925825595855713 }, { "auxiliary_loss_clip": 0.01133317, "auxiliary_loss_mlp": 0.01029486, "balance_loss_clip": 1.04566193, "balance_loss_mlp": 1.0222497, "epoch": 0.9978957494138159, "flos": 26030280666240.0, "grad_norm": 1.4798195646132308, "language_loss": 0.66430199, "learning_rate": 4.384091060338768e-11, "loss": 0.68593001, "num_input_tokens_seen": 179165060, "step": 8299, "time_per_iteration": 3.442920446395874 }, { "auxiliary_loss_clip": 0.01148794, "auxiliary_loss_mlp": 0.01026982, "balance_loss_clip": 1.04579365, "balance_loss_mlp": 1.02058363, "epoch": 0.998015992304455, "flos": 22637835734400.0, "grad_norm": 2.0628464849608847, "language_loss": 0.73467684, "learning_rate": 3.883487126810081e-11, "loss": 0.75643456, "num_input_tokens_seen": 179184320, "step": 8300, "time_per_iteration": 3.2611000537872314 }, { "auxiliary_loss_clip": 0.01138198, "auxiliary_loss_mlp": 0.01022379, "balance_loss_clip": 1.04244947, "balance_loss_mlp": 1.01577473, "epoch": 0.9981362351950941, "flos": 18223444955520.0, "grad_norm": 1.8141399599732482, "language_loss": 0.79249209, "learning_rate": 3.41322244516995e-11, "loss": 0.81409782, "num_input_tokens_seen": 179202265, "step": 8301, "time_per_iteration": 3.275376558303833 }, { "auxiliary_loss_clip": 0.01094413, "auxiliary_loss_mlp": 0.01021061, "balance_loss_clip": 1.04030144, "balance_loss_mlp": 1.01451349, "epoch": 0.9982564780857331, "flos": 33474095573760.0, "grad_norm": 1.7001666641182565, "language_loss": 0.62839693, "learning_rate": 2.9732970866946925e-11, "loss": 0.64955175, "num_input_tokens_seen": 179222145, "step": 8302, "time_per_iteration": 2.6849722862243652 }, { "auxiliary_loss_clip": 0.0110661, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 1.03821993, "balance_loss_mlp": 1.02132726, "epoch": 0.9983767209763723, "flos": 15523465392000.0, "grad_norm": 2.214282309943189, "language_loss": 0.78438669, "learning_rate": 2.563711118175327e-11, "loss": 0.80574656, "num_input_tokens_seen": 179239030, "step": 8303, "time_per_iteration": 2.5452687740325928 }, { "auxiliary_loss_clip": 0.01115443, "auxiliary_loss_mlp": 0.01024877, "balance_loss_clip": 1.04265857, "balance_loss_mlp": 1.01854646, "epoch": 0.9984969638670114, "flos": 19974377324160.0, "grad_norm": 1.7804971820713567, "language_loss": 0.8384949, "learning_rate": 2.184464601717728e-11, "loss": 0.85989809, "num_input_tokens_seen": 179257345, "step": 8304, "time_per_iteration": 2.5694355964660645 }, { "auxiliary_loss_clip": 0.01154059, "auxiliary_loss_mlp": 0.01023873, "balance_loss_clip": 1.04924834, "balance_loss_mlp": 1.01661325, "epoch": 0.9986172067576504, "flos": 20375750874240.0, "grad_norm": 2.1957517949029235, "language_loss": 0.76946533, "learning_rate": 1.8355575948758585e-11, "loss": 0.79124469, "num_input_tokens_seen": 179275330, "step": 8305, "time_per_iteration": 2.4894847869873047 }, { "auxiliary_loss_clip": 0.011322, "auxiliary_loss_mlp": 0.01025451, "balance_loss_clip": 1.04084325, "balance_loss_mlp": 1.01835752, "epoch": 0.9987374496482896, "flos": 23727903724800.0, "grad_norm": 2.5588275736766932, "language_loss": 0.73392224, "learning_rate": 1.5169901505407424e-11, "loss": 0.75549883, "num_input_tokens_seen": 179292395, "step": 8306, "time_per_iteration": 2.5369577407836914 }, { "auxiliary_loss_clip": 0.01132256, "auxiliary_loss_mlp": 0.01027353, "balance_loss_clip": 1.04385328, "balance_loss_mlp": 1.02048659, "epoch": 0.9988576925389286, "flos": 25044029959680.0, "grad_norm": 1.7283166706903617, "language_loss": 0.74044228, "learning_rate": 1.228762317073695e-11, "loss": 0.76203841, "num_input_tokens_seen": 179311225, "step": 8307, "time_per_iteration": 2.557877779006958 }, { "auxiliary_loss_clip": 0.01132076, "auxiliary_loss_mlp": 0.01022001, "balance_loss_clip": 1.04345918, "balance_loss_mlp": 1.01535249, "epoch": 0.9989779354295677, "flos": 31285627637760.0, "grad_norm": 1.9077190797309398, "language_loss": 0.78826797, "learning_rate": 9.70874138195299e-12, "loss": 0.80980873, "num_input_tokens_seen": 179333135, "step": 8308, "time_per_iteration": 2.6038830280303955 }, { "auxiliary_loss_clip": 0.01163246, "auxiliary_loss_mlp": 0.01030927, "balance_loss_clip": 1.04632676, "balance_loss_mlp": 1.02279723, "epoch": 0.9990981783202069, "flos": 19573398823680.0, "grad_norm": 1.4680620295247162, "language_loss": 0.74751639, "learning_rate": 7.433256530076093e-12, "loss": 0.76945812, "num_input_tokens_seen": 179353090, "step": 8309, "time_per_iteration": 2.449739456176758 }, { "auxiliary_loss_clip": 0.01110433, "auxiliary_loss_mlp": 0.01022359, "balance_loss_clip": 1.0399704, "balance_loss_mlp": 1.01621079, "epoch": 0.9992184212108459, "flos": 17199667514880.0, "grad_norm": 2.1119011294751187, "language_loss": 0.75963062, "learning_rate": 5.46116896038562e-12, "loss": 0.78095847, "num_input_tokens_seen": 179367500, "step": 8310, "time_per_iteration": 2.5589561462402344 }, { "auxiliary_loss_clip": 0.01129165, "auxiliary_loss_mlp": 0.01028657, "balance_loss_clip": 1.04262745, "balance_loss_mlp": 1.02138269, "epoch": 0.999338664101485, "flos": 46497853681920.0, "grad_norm": 2.219431837669009, "language_loss": 0.61769891, "learning_rate": 3.792478972197699e-12, "loss": 0.6392771, "num_input_tokens_seen": 179388085, "step": 8311, "time_per_iteration": 2.715505361557007 }, { "auxiliary_loss_clip": 0.01161415, "auxiliary_loss_mlp": 0.0102128, "balance_loss_clip": 1.04585695, "balance_loss_mlp": 1.01499796, "epoch": 0.9994589069921241, "flos": 15158253859200.0, "grad_norm": 2.128951955632126, "language_loss": 0.6996817, "learning_rate": 2.4271868181990895e-12, "loss": 0.72150862, "num_input_tokens_seen": 179405250, "step": 8312, "time_per_iteration": 2.4336795806884766 }, { "auxiliary_loss_clip": 0.01149744, "auxiliary_loss_mlp": 0.01022448, "balance_loss_clip": 1.0444603, "balance_loss_mlp": 1.01545048, "epoch": 0.9995791498827632, "flos": 12531460256640.0, "grad_norm": 2.170793311852865, "language_loss": 0.81140238, "learning_rate": 1.3652927060014973e-12, "loss": 0.83312428, "num_input_tokens_seen": 179420845, "step": 8313, "time_per_iteration": 3.2025845050811768 }, { "auxiliary_loss_clip": 0.01120791, "auxiliary_loss_mlp": 0.01026921, "balance_loss_clip": 1.04264176, "balance_loss_mlp": 1.01942301, "epoch": 0.9996993927734023, "flos": 19245175320960.0, "grad_norm": 1.8571373452598983, "language_loss": 0.63931102, "learning_rate": 6.067967965872612e-13, "loss": 0.66078806, "num_input_tokens_seen": 179440455, "step": 8314, "time_per_iteration": 2.5529634952545166 }, { "auxiliary_loss_clip": 0.01119658, "auxiliary_loss_mlp": 0.01036136, "balance_loss_clip": 1.04478836, "balance_loss_mlp": 1.02894199, "epoch": 0.9998196356640414, "flos": 62952804518400.0, "grad_norm": 1.5538371306867649, "language_loss": 0.7709533, "learning_rate": 1.5169920497548615e-13, "loss": 0.79251122, "num_input_tokens_seen": 179465075, "step": 8315, "time_per_iteration": 2.9255104064941406 }, { "auxiliary_loss_clip": 0.01102091, "auxiliary_loss_mlp": 0.01017494, "balance_loss_clip": 1.02693558, "balance_loss_mlp": 1.0130868, "epoch": 0.9999398785546805, "flos": 50922375073920.0, "grad_norm": 1.1533668498728562, "language_loss": 0.54982793, "learning_rate": 0.0, "loss": 0.57102382, "num_input_tokens_seen": 179513955, "step": 8316, "time_per_iteration": 3.0551352500915527 }, { "epoch": 0.9999398785546805, "num_input_tokens_seen": 179513955, "step": 8316, "total_flos": 6.996749092776837e+17, "train_loss": 0.7889158849090611, "train_runtime": 23734.1053, "train_samples_per_second": 14.016, "train_steps_per_second": 0.35 } ], "logging_steps": 1.0, "max_steps": 8316, "num_input_tokens_seen": 179513955, "num_train_epochs": 1, "save_steps": 1664, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.996749092776837e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }